basic parsing working
This commit is contained in:
@@ -9,14 +9,14 @@ def _():
|
||||
import marimo as mo
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from utils import create_sentiment_matrix
|
||||
|
||||
INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
|
||||
WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
|
||||
|
||||
if not WORKING_DIR.exists():
|
||||
WORKING_DIR.mkdir(parents=True)
|
||||
|
||||
return INPUT_DIR, Path, WORKING_DIR, mo, pd
|
||||
return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@@ -62,55 +62,6 @@ def _(mo):
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(document_name, pd):
    def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
        """
        Build a context-by-tag matrix of summed sentiment scores.

        Rows of `doc_df` are kept only when their 'tag' matches
        `column_prefix` and their '_context' matches `row_prefix`. The
        surviving 'sentiment' values are summed per (_context, tag) pair and
        pivoted so that each '_context' value becomes a row and each 'tag'
        value becomes a column.

        Parameters:
        - doc_df: DataFrame with at least the columns 'tag', '_context' and
          'sentiment'.
        - column_prefix: pattern passed to `str.contains` to select the tags
          that become matrix columns. It is treated as a regular expression,
          so the '|' in the default means "either prefix".
        - row_prefix: pattern passed to `str.contains` to select the
          '_context' values that become matrix rows (also a regular
          expression).

        Returns:
        - DataFrame indexed by '_context' with one column per 'tag', or an
          empty DataFrame when no row passes one of the two filters.

        Note: the "nothing found" messages report `document_name` taken from
        the enclosing notebook cell, not a value derived from `doc_df`.
        """
        # Keep only rows whose tag matches the column pattern; missing tags
        # (NaN) count as "no match" because of na=False.
        tag_mask = doc_df['tag'].str.contains(column_prefix, na=False)
        sentiment_rows = doc_df[tag_mask]

        if sentiment_rows.empty:
            print(f"No sentiment data found for document: {document_name}")
            return pd.DataFrame()

        # Keep only rows whose context matches the row pattern. na=False
        # already rejects missing contexts, so no separate notna() is needed.
        context_mask = sentiment_rows['_context'].str.contains(row_prefix, na=False)
        valid_rows = sentiment_rows[context_mask]

        if valid_rows.empty:
            print(f"No Voice/Character context found for document: {document_name}")
            return pd.DataFrame()

        # Sum the sentiment per (context, tag) pair first so that every pair
        # is unique, which pivot() requires.
        matrix_data = (
            valid_rows
            .groupby(['_context', 'tag'])['sentiment']
            .sum()
            .reset_index()
        )

        # Pairs that never occur are left as NaN by pivot(), so the result is
        # deliberately not cast to int here (the cast would fail on NaN).
        return matrix_data.pivot(index='_context', columns='tag', values='sentiment')

    return (create_sentiment_matrix,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
|
||||
Reference in New Issue
Block a user