import marimo as mo
import polars as pl
import altair as alt
from theme import ColorPalette

def check_progress(data):
    """Report whether every response has reached 100% progress.

    Args:
        data: Polars LazyFrame containing a 'progress' column.

    Returns:
        A markdown string: the success message only when the frame has at
        least one row and every row's progress equals 100; otherwise the
        incomplete-responses warning.
    """
    # Aggregate lazily so only two scalars are materialised instead of the
    # whole frame. The cast is non-strict so a string-typed column still
    # compares numerically; unparseable or null values count as incomplete.
    summary = data.select(
        pl.len().alias('n_rows'),
        (pl.col('progress').cast(pl.Float64, strict=False) == 100)
        .fill_null(False)
        .sum()
        .alias('n_complete'),
    ).collect()

    n_rows = summary['n_rows'][0]
    n_complete = summary['n_complete'][0]

    # Checking for a single distinct value is not enough: every row could
    # share the same value below 100.
    if n_rows > 0 and n_complete == n_rows:
        return """### Responses Complete: \n\n✅ All responses are complete (progress = 100) """

    return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"


def duration_validation(data):
    """Flag responses whose duration lies more than 3 standard deviations from the mean.

    Args:
        data: Polars LazyFrame containing a numeric 'duration' column.

    Returns:
        A markdown string. Either a short "no outliers" message, or a report
        with the summary statistics and an embedded marimo table of the
        flagged rows. The flagged rows are only reported, not removed.
    """
    duration_stats = data.select(
        pl.col('duration').mean().alias('mean_duration'),
        pl.col('duration').std().alias('std_duration')
    ).collect()
    mean_duration = duration_stats['mean_duration'][0]
    std_duration = duration_stats['std_duration'][0]

    # With no rows (mean is null) or a single row (sample std is null) the
    # thresholds are undefined; arithmetic on None would raise TypeError.
    if mean_duration is None or std_duration is None:
        return "### Duration Outliers: \n\n✅ No duration outliers detected"

    upper_outlier_threshold = mean_duration + 3 * std_duration
    lower_outlier_threshold = mean_duration - 3 * std_duration

    is_outlier = (
        (pl.col('duration') > upper_outlier_threshold)
        | (pl.col('duration') < lower_outlier_threshold)
    ).alias('outlier_duration')

    # Keep only the flagged rows; the flag column stays in the output table.
    outlier_data = data.with_columns(is_outlier).filter(pl.col('outlier_duration')).collect()

    if outlier_data.shape[0] == 0:
        return "### Duration Outliers: \n\n✅ No duration outliers detected"

    # The table is embedded through f-string formatting of the marimo UI element.
    return f"""### Duration Outliers:
    
    **⚠️ Potential outliers detected based on response duration ⚠️**
    
    | Metric | Value |
    |--------|-------|
    | Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
    | Standard Deviation of Duration | {std_duration:.2f} seconds |
    | Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
    | Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
    | Number of Outlier Responses | {outlier_data.shape[0]} |
    
    Outliers:
    
    {mo.ui.table(outlier_data)}
    
    
    **⚠️ NOTE: These have not been removed from the dataset ⚠️**
    
    """


def _group_choice_columns(column_names):
    """Group '...__Vxx__Choice_y' columns by question key, ordered by choice number.

    Returns a dict mapping each group key (the part of the name up to and
    including '__Vxx') to its column names sorted by choice number. Groups
    with fewer than two columns are dropped. Keys keep first-seen order.
    """
    import re

    # The choice number is captured here rather than re-parsed from the name
    # later, so a matched column with trailing text cannot break int().
    pattern = re.compile(r"(.*__V\d+)__Choice_(\d+)")

    numbered = {}
    for name in column_names:
        match = pattern.search(name)
        if match:
            numbered.setdefault(match.group(1), []).append((int(match.group(2)), name))

    return {
        key: [name for _, name in sorted(entries, key=lambda entry: entry[0])]
        for key, entries in numbered.items()
        if len(entries) > 1
    }


def _straight_line_expressions(groups, max_score):
    """Build the per-group '__is_straight__', '__val__' and '__has_data__' expressions."""
    expressions = []
    for key, cols in groups.items():
        # Only answered (non-null) attributes take part in the check.
        answered = pl.concat_list(cols).list.drop_nulls()
        has_data = answered.list.len() > 0

        # min() rather than get(0): it is null on an empty list instead of
        # failing, and equals the single repeated value when n_unique == 1.
        value = answered.list.min()

        is_straight = has_data & (answered.list.n_unique() == 1) & (value <= max_score)

        expressions.extend([
            is_straight.alias(f"__is_straight__{key}"),
            value.alias(f"__val__{key}"),
            has_data.alias(f"__has_data__{key}"),
        ])
    return expressions


def _straight_liner_charts(stats_df, dist_df):
    """Build the stacked Altair charts: share per question group and value distribution."""
    # Chart 1: share of all responses that straight-lined each group,
    # with the absolute count printed above each bar.
    base_pct = alt.Chart(stats_df).encode(
        x=alt.X("Question Group", sort=alt.EncodingSortField(field="Straight-Liner %", order="descending"))
    )

    bars_pct = base_pct.mark_bar(color=ColorPalette.PRIMARY).encode(
        y=alt.Y("Straight-Liner %:Q", axis=alt.Axis(format=".1f", title="Share of all responses [%]")),
        tooltip=["Question Group", alt.Tooltip("Straight-Liner %:Q", format=".1f"), "Count"]
    )

    text_pct = base_pct.mark_text(dy=-10).encode(
        y=alt.Y("Straight-Liner %:Q"),
        text=alt.Text("Count")
    )

    chart_pct = (bars_pct + text_pct).properties(
        title="Share of Responses with Straight-Liners per Question",
        width=800,
        height=300
    )

    # Chart 2: normalized horizontal stacked bars of the straight-lined value.
    # The Y order is computed once so the bar and text layers agree
    # (highest count on top).
    sorted_groups = stats_df.sort("Count", descending=True)["Question Group"].to_list()

    bars_dist = alt.Chart(dist_df).mark_bar().encode(
        y=alt.Y("Question Group", sort=sorted_groups),
        x=alt.X("Count", stack="normalize", axis=alt.Axis(format="%"), title="Share of SL Responses"),
        color=alt.Color("Value:O",
                        title=None,  # no legend title
                        scale=alt.Scale(scheme="blues"),  # single-hue scale instead of the categorical default
                        legend=alt.Legend(orient="top", direction="horizontal")
                       ),
        order=alt.Order("Value", sort="ascending"),  # lowest value on the left
        tooltip=["Question Group", "Value", "Count"]
    )

    # Total count per group, placed at the 100% mark of the normalized bar.
    text_dist = alt.Chart(stats_df).mark_text(align='left', dx=5).encode(
        y=alt.Y("Question Group", sort=sorted_groups),
        x=alt.datum(1.0),
        text=alt.Text("Count")
    )

    chart_dist = (bars_dist + text_dist).properties(
        title="Distribution of Straight-Lined Values",
        width=800,
        height=500
    )

    return alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")


def check_straight_liners(data, max_score=3):
    """
    Check for straight-lining behavior (selecting same value for all attributes).

    Columns are grouped by the naming pattern '<prefix>__Vxx__Choice_y'. A
    respondent straight-lines a group when all of their non-null answers in
    that group are identical and that value is <= max_score.

    Args:
        data: Polars LazyFrame. Must contain a '_recordId' column when
            straight-liners are found, since it is used to label flagged rows.
        max_score: The maximum score that is flagged if straight-lined (e.g., if 4, then 5s are allowed).

    Returns:
        A markdown string when no question groups or no straight-liners are
        found; otherwise a marimo vstack with the table of flagged responses,
        summary text and two Altair charts.
    """
    groups = _group_choice_columns(data.collect_schema().names())

    if not groups:
        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."

    # Non-strict cast to Int64 so string-typed columns can be compared
    # numerically; values that cannot be parsed become null.
    group_cols = [col for cols in groups.values() for col in cols]
    data = data.with_columns([
        pl.col(col).cast(pl.Int64, strict=False) for col in group_cols
    ])

    checked_data = data.with_columns(_straight_line_expressions(groups, max_score)).collect()

    # Straight-lining rows per group, filtered once and reused below.
    flagged = {
        key: checked_data.filter(pl.col(f"__is_straight__{key}"))
        for key in groups
    }

    # One table row per (respondent, group) that straight-lined.
    outliers = []
    for key, cols in groups.items():
        rows = flagged[key]
        if rows.height == 0:
            continue

        val_col = f"__val__{key}"
        subset = rows.select(["_recordId", val_col] + cols)

        for row in subset.iter_rows(named=True):
            # Answers in choice order; unanswered attributes shown as 'NaN'.
            resp_list = [row[c] if row[c] is not None else 'NaN' for c in cols]
            outliers.append({
                "Record ID": row["_recordId"],
                "Question Group": key,
                "Value": row[val_col],
                "Responses": str(resp_list)
            })

    if not outliers:
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"

    outlier_df = pl.DataFrame(outliers)

    # --- Analysis & Visualization ---

    # Non-zero here: at least one row was flagged above.
    total_respondents = checked_data.height

    # A respondent straight-lined "everything" when every group they answered
    # is straight ((not has_data) OR is_straight) and they answered at least
    # one group.
    conditions = [
        (~pl.col(f"__has_data__{key}")) | pl.col(f"__is_straight__{key}")
        for key in groups
    ]
    has_any_data_exprs = [pl.col(f"__has_data__{key}") for key in groups]

    all_straight_count = checked_data.filter(
        pl.all_horizontal(conditions) & pl.any_horizontal(has_any_data_exprs)
    ).height
    all_straight_pct = (all_straight_count / total_respondents) * 100

    group_stats = []
    value_dist_data = []

    for key, sl_sub in flagged.items():
        val_col = f"__val__{key}"
        count = sl_sub.height

        group_stats.append({
            "Question Group": key,
            "Straight-Liner %": (count / total_respondents) * 100,
            "Count": count
        })

        # Distribution of the straight-lined value within this group.
        if count > 0:
            dist = sl_sub.group_by(val_col).agg(pl.len().alias("count"))
            for row in dist.iter_rows(named=True):
                value_dist_data.append({
                    "Question Group": key,
                    "Value": row[val_col],
                    "Count": row["count"]
                })

    stats_df = pl.DataFrame(group_stats)
    dist_df = pl.DataFrame(value_dist_data)

    analysis_md = f"""
    ### Straight-Lining Analysis
    
    *"Straight-lining" is defined here as selecting the same response value for all attributes within a multi-attribute question group.*
    
    *   **Total Respondents**: {total_respondents}
    *   **Respondents straight-lining ALL questions presented to them**: {all_straight_pct:.2f}% ({all_straight_count} respondents)
    
    """

    return mo.vstack([
        mo.md("### Straight-lining Checks:\n\n**⚠️ Potential straight-liners detected ⚠️**\n\n"),
        mo.ui.table(outlier_df),
        mo.md(analysis_md),
        mo.md("#### Speaking Style Question Groups"),
        _straight_liner_charts(stats_df, dist_df)
    ])


if __name__ == "__main__":
    # Manual smoke run: load one survey export and print the straight-lining
    # report for each speaking-style question set.
    from utils import JPMCSurvey

    RESULTS_FILE = "data/exports/OneDrive_2026-01-28/1-28-26 Afternoon/JPMC_Chase Brand Personality_Quant Round 1_January 28, 2026_Afternoon_Labels.csv"
    QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"

    S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data = S.load_data()

    # (heading, name of the survey method that yields the subset); only the
    # first element of each method's return value is checked.
    for heading, getter_name in (
        ("Checking Green Blue:", "get_ss_green_blue"),
        ("Checking Orange Red:", "get_ss_orange_red"),
    ):
        print(heading)
        subset = getattr(S, getter_name)(data)[0]
        print(check_straight_liners(subset))