missing data analysis
This commit is contained in:
@@ -16,8 +16,8 @@ from speaking_styles import SPEAKING_STYLES
|
||||
|
||||
# %% Fixed Variables
|
||||
|
||||
# RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
|
||||
RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
|
||||
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
|
||||
# RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
|
||||
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ def parse_cli_args():
|
||||
parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
|
||||
parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")')
|
||||
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 3 removes anyone with 3+ straight-lined groups)')
|
||||
parser.add_argument('--voice-ranking-filter', type=str, default=None, choices=['only-missing', 'exclude-missing'], help='Filter by voice ranking completeness: "only-missing" keeps only respondents missing QID98 ranking data, "exclude-missing" removes them')
|
||||
|
||||
# Only parse if running as script (not in Jupyter/interactive)
|
||||
try:
|
||||
@@ -58,7 +59,7 @@ def parse_cli_args():
|
||||
get_ipython() # noqa: F821 # type: ignore
|
||||
# Return namespace with all filters set to None
|
||||
no_filters = {f: None for f in FILTER_CONFIG}
|
||||
return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None)
|
||||
return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None, voice_ranking_filter=None)
|
||||
except NameError:
|
||||
args = parser.parse_args()
|
||||
# Parse JSON strings to lists
|
||||
@@ -174,6 +175,26 @@ if cli_args.sl_threshold is not None:
|
||||
else:
|
||||
print(" No straight-liners detected — no respondents removed.")
|
||||
|
||||
# %% Apply voice-ranking completeness filter (if specified)
# Keeps only / excludes respondents who are missing the explicit voice
# ranking question (QID98) despite completing the top-3 selection (QID36).
#
# The two CLI choices differ only in the join type applied against the table
# of "missing ranking" record IDs and in the progress message printed, so both
# are looked up from a single table keyed by the CLI value.
if cli_args.voice_ranking_filter is not None:
    # Store on Survey so filter slug/description include it
    S.voice_ranking_filter = cli_args.voice_ranking_filter

    # Record IDs of respondents flagged as missing the ranking data.
    _vr_missing = S.get_top_3_voices_missing_ranking(_d)
    _vr_missing_ids = _vr_missing.select('_recordId')

    # Row count before filtering, for the summary line printed at the end.
    _n_before = _d.select(pl.len()).collect().item()

    # CLI value -> (join type, progress message).
    #   'inner' keeps only rows of _d whose _recordId is in the missing set;
    #   'anti'  keeps only rows of _d whose _recordId is NOT in it.
    _vr_modes = {
        'only-missing': (
            'inner',
            f"Voice ranking filter: keeping ONLY respondents missing QID98 ranking data...",
        ),
        'exclude-missing': (
            'anti',
            f"Voice ranking filter: EXCLUDING respondents missing QID98 ranking data...",
        ),
    }
    _vr_mode = _vr_modes.get(cli_args.voice_ranking_filter)

    # Any other value leaves _d untouched, as in the if/elif form.
    if _vr_mode is not None:
        _vr_how, _vr_message = _vr_mode
        print(_vr_message)
        # Materialise _d for the join, then hand a lazy frame back downstream.
        _d = _d.collect().join(_vr_missing_ids, on='_recordId', how=_vr_how).lazy()

    S.data_filtered = _d
    _n_after = _d.select(pl.len()).collect().item()
    print(f"  {_n_before} → {_n_after} respondents ({_vr_missing_ids.height} missing ranking data)")
|
||||
|
||||
# Save to logical variable name for further analysis
|
||||
data = _d
|
||||
data.collect()
|
||||
|
||||
Reference in New Issue
Block a user