basic parsing working
This commit is contained in:
65
utils/data_utils.py
Normal file
65
utils/data_utils.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from typing import Optional

import pandas as pd
|
||||
|
||||
|
||||
def _contains_pattern(series, pattern):
    """Return a boolean mask marking the entries of `series` that match `pattern`.

    Missing values never match. A column without any string data (for
    example an all-NaN float column) yields an all-False mask.
    """
    try:
        return series.str.contains(pattern, na=False)
    except AttributeError:
        # pandas refuses the `.str` accessor on non-string dtypes; such a
        # column cannot contain a match, so report "nothing matched".
        return pd.Series(False, index=series.index, dtype=bool)


def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Create a sentiment matrix from the rows of a document DataFrame.

    Parameters:
    - doc_df: DataFrame providing at least the columns 'tag', '_context'
      and 'sentiment'
    - column_prefix: regular expression; only rows whose 'tag' contains a
      match are kept (the match may occur anywhere in the tag)
    - row_prefix: regular expression; only rows whose '_context' contains a
      match are kept (the match may occur anywhere in the context)

    Returns:
    - DataFrame indexed by '_context' with one column per 'tag'; each cell is
      the summed 'sentiment' for that combination (NaN where a combination
      does not occur)
    - An empty DataFrame, after printing a short message, when no row passes
      the tag filter or the context filter
    """
    # Keep only rows whose tag matches the sentiment prefixes (VT-/CT-)
    sentiment_rows = doc_df[_contains_pattern(doc_df['tag'], column_prefix)]
    if sentiment_rows.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Keep only rows with a Voice/Character context; `na=False` inside the
    # helper already discards missing contexts.
    valid_rows = sentiment_rows[
        _contains_pattern(sentiment_rows['_context'], row_prefix)
    ]
    if valid_rows.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum sentiment scores for every (context, tag) combination ...
    matrix_data = (
        valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
    )

    # ... and spread the tags out into columns to form the matrix.
    return matrix_data.pivot(index='_context', columns='tag', values='sentiment')
|
||||
|
||||
|
||||
|
||||
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> Optional[str]:
    """
    Extract the theme from a tag string.

    Parameters:
    - tag: str, the tag string (e.g., 'VT - Personal Experience')
    - theme_prefixes: str, '|'-separated literal prefixes to remove from the
      start of the tag (e.g., 'VT - |CT - ')

    Returns:
    - str, the tag without its leading prefix and without surrounding
      whitespace (e.g., 'Personal Experience')
    - None if the tag starts with none of the prefixes or is not a string
    """
    # Missing values (None/NaN) have no theme.
    if not isinstance(tag, str):
        return None

    for prefix in theme_prefixes.split('|'):
        if tag.startswith(prefix):
            # Slice off only the leading prefix; later occurrences of the
            # same text inside the theme must be preserved.
            return tag[len(prefix):].strip()
    return None
|
||||
|
||||
Reference in New Issue
Block a user