missing data analysis
This commit is contained in:
@@ -16,8 +16,8 @@ from speaking_styles import SPEAKING_STYLES
|
|||||||
|
|
||||||
# %% Fixed Variables
|
# %% Fixed Variables
|
||||||
|
|
||||||
# RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
|
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
|
||||||
RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
|
# RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
|
||||||
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
|
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
|
||||||
|
|
||||||
|
|
||||||
@@ -51,6 +51,7 @@ def parse_cli_args():
|
|||||||
parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
|
parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
|
||||||
parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")')
|
parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")')
|
||||||
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 3 removes anyone with 3+ straight-lined groups)')
|
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 3 removes anyone with 3+ straight-lined groups)')
|
||||||
|
parser.add_argument('--voice-ranking-filter', type=str, default=None, choices=['only-missing', 'exclude-missing'], help='Filter by voice ranking completeness: "only-missing" keeps only respondents missing QID98 ranking data, "exclude-missing" removes them')
|
||||||
|
|
||||||
# Only parse if running as script (not in Jupyter/interactive)
|
# Only parse if running as script (not in Jupyter/interactive)
|
||||||
try:
|
try:
|
||||||
@@ -58,7 +59,7 @@ def parse_cli_args():
|
|||||||
get_ipython() # noqa: F821 # type: ignore
|
get_ipython() # noqa: F821 # type: ignore
|
||||||
# Return namespace with all filters set to None
|
# Return namespace with all filters set to None
|
||||||
no_filters = {f: None for f in FILTER_CONFIG}
|
no_filters = {f: None for f in FILTER_CONFIG}
|
||||||
return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None)
|
return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None, voice_ranking_filter=None)
|
||||||
except NameError:
|
except NameError:
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
# Parse JSON strings to lists
|
# Parse JSON strings to lists
|
||||||
@@ -174,6 +175,26 @@ if cli_args.sl_threshold is not None:
|
|||||||
else:
|
else:
|
||||||
print(" No straight-liners detected — no respondents removed.")
|
print(" No straight-liners detected — no respondents removed.")
|
||||||
|
|
||||||
|
# %% Apply voice-ranking completeness filter (if specified)
# Keeps only / excludes the respondents reported by
# S.get_top_3_voices_missing_ranking(), i.e. those with no explicit voice
# ranking data (QID98).
if cli_args.voice_ranking_filter is not None:
    # Store on Survey so filter slug/description include it
    S.voice_ranking_filter = cli_args.voice_ranking_filter
    _vr_missing = S.get_top_3_voices_missing_ranking(_d)
    _vr_missing_ids = _vr_missing.select('_recordId')

    # Materialise the frame once: both the "before" count and the join need
    # the collected data, so there is no reason to execute the lazy plan twice.
    _vr_frame = _d.collect()
    _n_before = _vr_frame.height

    if cli_args.voice_ranking_filter == 'only-missing':
        print("Voice ranking filter: keeping ONLY respondents missing QID98 ranking data...")
        # Inner join keeps only rows whose _recordId is in the missing set.
        _vr_frame = _vr_frame.join(_vr_missing_ids, on='_recordId', how='inner')
    elif cli_args.voice_ranking_filter == 'exclude-missing':
        print("Voice ranking filter: EXCLUDING respondents missing QID98 ranking data...")
        # Anti join drops every row whose _recordId is in the missing set.
        _vr_frame = _vr_frame.join(_vr_missing_ids, on='_recordId', how='anti')

    # Hand a LazyFrame back to the rest of the script, as before.
    _d = _vr_frame.lazy()
    S.data_filtered = _d
    _n_after = _vr_frame.height
    print(f" {_n_before} → {_n_after} respondents ({_vr_missing_ids.height} missing ranking data)")
|
||||||
|
|
||||||
# Save to logical variable name for further analysis
|
# Save to logical variable name for further analysis
|
||||||
data = _d
|
data = _d
|
||||||
data.collect()
|
data.collect()
|
||||||
|
|||||||
1359
analysis_missing_voice_ranking.ipynb
Normal file
1359
analysis_missing_voice_ranking.ipynb
Normal file
File diff suppressed because one or more lines are too long
14
plots.py
14
plots.py
@@ -96,6 +96,11 @@ class QualtricsPlotsMixin:
|
|||||||
sl_threshold = getattr(self, 'sl_threshold', None)
|
sl_threshold = getattr(self, 'sl_threshold', None)
|
||||||
if sl_threshold is not None:
|
if sl_threshold is not None:
|
||||||
parts.append(f"SL-gte{sl_threshold}")
|
parts.append(f"SL-gte{sl_threshold}")
|
||||||
|
|
||||||
|
# Append voice ranking filter if set
|
||||||
|
vr_filter = getattr(self, 'voice_ranking_filter', None)
|
||||||
|
if vr_filter is not None:
|
||||||
|
parts.append(f"VR-{vr_filter}")
|
||||||
|
|
||||||
if not parts:
|
if not parts:
|
||||||
return "All_Respondents"
|
return "All_Respondents"
|
||||||
@@ -191,6 +196,15 @@ class QualtricsPlotsMixin:
|
|||||||
sl_threshold = getattr(self, 'sl_threshold', None)
|
sl_threshold = getattr(self, 'sl_threshold', None)
|
||||||
if sl_threshold is not None:
|
if sl_threshold is not None:
|
||||||
parts.append(f"STRAIGHT-LINER EXCL: ≥{sl_threshold} question groups")
|
parts.append(f"STRAIGHT-LINER EXCL: ≥{sl_threshold} question groups")
|
||||||
|
|
||||||
|
# Append voice ranking filter if set
|
||||||
|
vr_filter = getattr(self, 'voice_ranking_filter', None)
|
||||||
|
if vr_filter is not None:
|
||||||
|
vr_labels = {
|
||||||
|
'only-missing': 'ONLY respondents missing voice ranking (QID98)',
|
||||||
|
'exclude-missing': 'EXCLUDING respondents missing voice ranking (QID98)',
|
||||||
|
}
|
||||||
|
parts.append(f"VOICE RANKING: {vr_labels.get(vr_filter, vr_filter)}")
|
||||||
|
|
||||||
if not parts:
|
if not parts:
|
||||||
# No filters active - return just sample size (or empty string if no sample size)
|
# No filters active - return just sample size (or empty string if no sample size)
|
||||||
|
|||||||
@@ -179,10 +179,25 @@ def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> li
|
|||||||
'filters': {'industry': [industry]}
|
'filters': {'industry': [industry]}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Voice ranking completeness filter
|
||||||
|
# These use a special flag rather than demographic filters, so we store
|
||||||
|
# the mode in a dedicated key that run_report passes as --voice-ranking-filter.
|
||||||
|
if not category or category in ['all_filters', 'voice_ranking']:
|
||||||
|
combinations.append({
|
||||||
|
'name': 'VoiceRanking-OnlyMissing',
|
||||||
|
'filters': {},
|
||||||
|
'voice_ranking_filter': 'only-missing',
|
||||||
|
})
|
||||||
|
combinations.append({
|
||||||
|
'name': 'VoiceRanking-ExcludeMissing',
|
||||||
|
'filters': {},
|
||||||
|
'voice_ranking_filter': 'exclude-missing',
|
||||||
|
})
|
||||||
|
|
||||||
return combinations
|
return combinations
|
||||||
|
|
||||||
|
|
||||||
def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None) -> bool:
|
def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None, voice_ranking_filter: str = None) -> bool:
|
||||||
"""
|
"""
|
||||||
Run the report script with given filters.
|
Run the report script with given filters.
|
||||||
|
|
||||||
@@ -191,6 +206,9 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_thresh
|
|||||||
name: Name for this filter combination (used for .txt description file)
|
name: Name for this filter combination (used for .txt description file)
|
||||||
dry_run: If True, just print command without running
|
dry_run: If True, just print command without running
|
||||||
sl_threshold: If set, exclude respondents with >= N straight-lined question groups
|
sl_threshold: If set, exclude respondents with >= N straight-lined question groups
|
||||||
|
voice_ranking_filter: If set, filter by voice ranking completeness.
|
||||||
|
'only-missing' keeps only respondents missing QID98 data,
|
||||||
|
'exclude-missing' removes them.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
True if successful, False otherwise
|
True if successful, False otherwise
|
||||||
@@ -205,6 +223,10 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_thresh
|
|||||||
if sl_threshold is not None:
|
if sl_threshold is not None:
|
||||||
cmd.extend(['--sl-threshold', str(sl_threshold)])
|
cmd.extend(['--sl-threshold', str(sl_threshold)])
|
||||||
|
|
||||||
|
# Pass voice ranking filter if specified
|
||||||
|
if voice_ranking_filter is not None:
|
||||||
|
cmd.extend(['--voice-ranking-filter', voice_ranking_filter])
|
||||||
|
|
||||||
for filter_name, values in filters.items():
|
for filter_name, values in filters.items():
|
||||||
if values:
|
if values:
|
||||||
cmd.extend([f'--{filter_name}', json.dumps(values)])
|
cmd.extend([f'--{filter_name}', json.dumps(values)])
|
||||||
@@ -235,7 +257,7 @@ def main():
|
|||||||
parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
|
parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--category',
|
'--category',
|
||||||
choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry'],
|
choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry', 'voice_ranking'],
|
||||||
default='all_filters',
|
default='all_filters',
|
||||||
help='Filter category to run combinations for (default: all_filters)'
|
help='Filter category to run combinations for (default: all_filters)'
|
||||||
)
|
)
|
||||||
@@ -259,7 +281,7 @@ def main():
|
|||||||
print("\nDRY RUN - Commands that would be executed:")
|
print("\nDRY RUN - Commands that would be executed:")
|
||||||
for combo in combinations:
|
for combo in combinations:
|
||||||
print(f"\n{combo['name']}:")
|
print(f"\n{combo['name']}:")
|
||||||
run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold)
|
run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter'))
|
||||||
return
|
return
|
||||||
|
|
||||||
# Run each combination with progress bar
|
# Run each combination with progress bar
|
||||||
@@ -268,7 +290,7 @@ def main():
|
|||||||
|
|
||||||
for combo in tqdm(combinations, desc="Running reports", unit="filter"):
|
for combo in tqdm(combinations, desc="Running reports", unit="filter"):
|
||||||
tqdm.write(f"Running: {combo['name']}")
|
tqdm.write(f"Running: {combo['name']}")
|
||||||
if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold):
|
if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter')):
|
||||||
successful += 1
|
successful += 1
|
||||||
else:
|
else:
|
||||||
failed.append(combo['name'])
|
failed.append(combo['name'])
|
||||||
|
|||||||
54
utils.py
54
utils.py
@@ -1115,6 +1115,60 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
|||||||
|
|
||||||
return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
|
return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
|
||||||
|
|
||||||
|
def get_top_3_voices_missing_ranking(
    self, q: pl.LazyFrame
) -> pl.DataFrame:
    """Identify respondents whose explicit voice-ranking data (QID98) is entirely null.

    A respondent is reported when every column returned by
    ``get_top_3_voices()`` other than ``_recordId`` is null for them. Such
    rows contribute nothing to ranking-based plots (e.g.
    ``plot_most_ranked_1``), which therefore undercount.

    The intent is to surface people who completed the top-3 voice
    selection (QID36) but never answered the ranking question.
    NOTE(review): QID36 completion is not checked here; a respondent with
    no ``3_Ranked`` value is still returned (with a null ``3_Ranked``).
    Confirm whether such rows should be excluded.

    Parameters:
        q: The (optionally filtered) LazyFrame from ``load_data()``.

    Returns:
        A collected ``pl.DataFrame`` with columns:

        - ``_recordId`` – the respondent identifier
        - ``3_Ranked`` – the ``3_Ranked`` value from ``get_18_8_3()``
          for that respondent (null when no matching row exists)

        When nobody is missing ranking data, an empty frame with those two
        columns (both ``pl.Utf8``) is returned.
    """
    # Get the top-3 ranking data (QID98-based)
    top3, _ = self.get_top_3_voices(q)
    top3_df = top3.collect()

    ranking_cols = [c for c in top3_df.columns if c != '_recordId']

    # Respondents where every ranking column is null. With no ranking
    # columns the condition is vacuously true, matching the previous
    # behaviour of starting the fold from a literal True.
    null_checks = [pl.col(name).is_null() for name in ranking_cols]
    all_null_expr = pl.all_horizontal(null_checks) if null_checks else pl.lit(True)

    missing_ids = top3_df.filter(all_null_expr).select('_recordId')

    if missing_ids.height == 0:
        # Skip the second data pull entirely when there is nothing to enrich.
        return pl.DataFrame(schema={
            '_recordId': pl.Utf8,
            '3_Ranked': pl.Utf8,
        })

    # Enrich with the 3_Ranked text from the 18→8→3 question
    v_18_8_3, _ = self.get_18_8_3(q)
    v_df = v_18_8_3.collect()

    # Left join so every missing respondent is kept even without a 3_Ranked row.
    return missing_ids.join(
        v_df.select(['_recordId', '3_Ranked']),
        on='_recordId',
        how='left',
    )
|
||||||
|
|
||||||
|
|
||||||
def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]:
|
def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]:
|
||||||
"""Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.
|
"""Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.
|
||||||
|
|||||||
Reference in New Issue
Block a user