basic parsing working

This commit is contained in:
2025-12-11 12:56:23 +01:00
parent b023d44934
commit e576f98cce
10 changed files with 411 additions and 183 deletions

View File

@@ -9,14 +9,14 @@ def _():
import marimo as mo
import pandas as pd
from pathlib import Path
from utils import create_sentiment_matrix
INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True)
return INPUT_DIR, Path, WORKING_DIR, mo, pd
return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd
@app.cell(hide_code=True)
@@ -62,55 +62,6 @@ def _(mo):
return
@app.cell
def _(document_name, pd):
import numpy as np
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Create a sentiment matrix from the rows of a single document.

    Rows are kept when their 'tag' matches ``column_prefix`` and their
    '_context' matches ``row_prefix``. The 'sentiment' values of the
    remaining rows are summed per ('_context', 'tag') pair and laid out with
    contexts as the index and tags as the columns.

    Parameters:
    - doc_df: DataFrame with at least the columns 'tag', '_context' and
      'sentiment'.
    - column_prefix: Regular expression handed to ``str.contains`` to select
      the tags that become matrix columns.
    - row_prefix: Regular expression handed to ``str.contains`` to select the
      contexts that become matrix rows.

    Returns:
    - DataFrame indexed by '_context' with one column per 'tag'. Combinations
      without any matching row are NaN. An empty DataFrame is returned, after
      printing a notice, when no row survives one of the two filters.

    Note: ``document_name`` is not a parameter. It is resolved from the
    enclosing scope and is only used in the printed notices.
    """
    # Keep only rows whose tag matches the column pattern. ``na=False`` makes
    # missing tags count as "no match" instead of propagating NaN into the mask.
    tag_mask = doc_df['tag'].str.contains(column_prefix, na=False)
    sentiment_rows = doc_df[tag_mask]

    if sentiment_rows.empty:
        print(f"No sentiment data found for document: {document_name}")
        return pd.DataFrame()

    # Keep only rows whose context matches the row pattern. Missing contexts
    # are already excluded by ``na=False``, so no separate notna() test is
    # required.
    context_mask = sentiment_rows['_context'].str.contains(row_prefix, na=False)
    valid_rows = sentiment_rows[context_mask]

    if valid_rows.empty:
        print(f"No Voice/Character context found for document: {document_name}")
        return pd.DataFrame()

    # Sum the sentiment per (context, tag) pair, then move the tag level into
    # the columns. This is what reset_index() followed by pivot() produces.
    totals = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum()
    matrix = totals.unstack('tag')

    # The matrix is intentionally not cast to int: (context, tag) pairs
    # without data are NaN, which an integer dtype cannot represent.
    return matrix
return (create_sentiment_matrix,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""