# Interview-Analysis/utils/data_utils.py

from typing import Optional

import pandas as pd


def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Build a context-by-tag matrix of summed sentiment scores.

    Parameters:
    - doc_df: DataFrame providing at least the columns 'tag', '_context'
      and 'sentiment'
    - column_prefix: pattern handed to Series.str.contains to select the
      'tag' values that become matrix columns (default 'VT - |CT - ').
      Note: the match is a regex search, so it is not anchored to the
      start of the tag.
    - row_prefix: pattern handed to Series.str.contains to select the
      '_context' values that become matrix rows (default '_V-|_C-')

    Returns:
    - DataFrame indexed by '_context' with one column per 'tag'; each cell
      is the sum of 'sentiment' for that pair (NaN where the pair never
      occurs). An empty DataFrame is returned, after printing a message,
      when no row survives either filter.
    """
    # Stage 1: keep only rows whose tag matches the column pattern.
    tag_mask = doc_df['tag'].str.contains(column_prefix, na=False)
    if not tag_mask.any():
        print("No sentiment data found")
        return pd.DataFrame()
    tagged = doc_df.loc[tag_mask]

    # Stage 2: of those, keep rows whose context is present and matches
    # the row pattern.
    context = tagged['_context']
    context_mask = context.notna() & context.str.contains(row_prefix, na=False)
    if not context_mask.any():
        print("No Voice/Character context found")
        return pd.DataFrame()
    selected = tagged.loc[context_mask]

    # Sum sentiment per (context, tag) pair, then spread the tag level
    # across the columns to form the matrix.
    totals = selected.groupby(['_context', 'tag'])['sentiment'].sum()
    return totals.unstack('tag')


def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> Optional[str]:
    """
    Extract the theme from a tag string.

    Parameters:
    - tag: str, the tag string (e.g., 'VT - Personal Experience')
    - theme_prefixes: str, '|'-separated literal prefixes; the first one
      the tag starts with is removed (e.g., 'VT - |CT - ')

    Returns:
    - str, the extracted theme with surrounding whitespace stripped
      (e.g., 'Personal Experience')
    - None if the tag is not a string (e.g., None or NaN) or does not
      start with any of the prefixes
    """
    # Non-string input (None, float NaN) has no startswith(); report
    # "no theme" instead of raising AttributeError.
    if not isinstance(tag, str):
        return None

    for prefix in theme_prefixes.split('|'):
        if tag.startswith(prefix):
            # Slice off only the leading prefix. str.replace would also
            # delete later occurrences of the prefix inside the theme.
            return tag[len(prefix):].strip()
    return None