sentiments saving to intermediate csv
This commit is contained in:
@@ -9,6 +9,7 @@ def _():
|
|||||||
import marimo as mo
|
import marimo as mo
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
|
TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
|
||||||
WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
|
WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
|
||||||
@@ -17,7 +18,7 @@ def _():
|
|||||||
WORKING_DIR.mkdir(parents=True)
|
WORKING_DIR.mkdir(parents=True)
|
||||||
if not TAGUETTE_EXPORT_DIR.exists():
|
if not TAGUETTE_EXPORT_DIR.exists():
|
||||||
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
|
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
|
||||||
return TAGUETTE_EXPORT_DIR, mo, pd
|
return TAGUETTE_EXPORT_DIR, WORKING_DIR, datetime, mo, pd
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
@@ -61,19 +62,19 @@ def _(mo):
|
|||||||
@app.cell
|
@app.cell
|
||||||
def _(all_tags_df, mo):
|
def _(all_tags_df, mo):
|
||||||
|
|
||||||
file_dropdown = mo.ui.dropdown(
|
interview_select = mo.ui.dropdown(
|
||||||
options=all_tags_df['document'].unique().tolist(),
|
options=all_tags_df['document'].unique().tolist(),
|
||||||
label="Select Interview to Process",
|
label="Select Interview to Process",
|
||||||
full_width=True
|
full_width=True
|
||||||
)
|
)
|
||||||
file_dropdown
|
interview_select
|
||||||
return (file_dropdown,)
|
return (interview_select,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(all_tags_df, file_dropdown):
|
def _(all_tags_df, interview_select):
|
||||||
# filter all_tags_df to only the document = file_dropdown.value
|
# filter all_tags_df to only the document = file_dropdown.value
|
||||||
df = all_tags_df.loc[all_tags_df['document'] == file_dropdown.value].copy()
|
df = all_tags_df.loc[all_tags_df['document'] == interview_select.value].copy()
|
||||||
return (df,)
|
return (df,)
|
||||||
|
|
||||||
|
|
||||||
@@ -139,7 +140,6 @@ def _(df):
|
|||||||
# Assign the context to all rows in this highlight
|
# Assign the context to all rows in this highlight
|
||||||
df.loc[df['id'] == highlight_id, '_context'] = context_tag
|
df.loc[df['id'] == highlight_id, '_context'] = context_tag
|
||||||
|
|
||||||
del idx
|
|
||||||
df
|
df
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -188,7 +188,12 @@ def _(df, pd):
|
|||||||
|
|
||||||
expanded_df_raw = pd.DataFrame(expanded_rows).reset_index(drop=True)
|
expanded_df_raw = pd.DataFrame(expanded_rows).reset_index(drop=True)
|
||||||
|
|
||||||
manual_rows = expanded_df_raw[expanded_df_raw['manual_analysis']]
|
|
||||||
|
sentiment_df = expanded_df_raw.loc[
|
||||||
|
expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
|
||||||
|
].copy()
|
||||||
|
|
||||||
|
manual_rows = sentiment_df[sentiment_df['manual_analysis']]
|
||||||
if not manual_rows.empty:
|
if not manual_rows.empty:
|
||||||
print(
|
print(
|
||||||
f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
|
f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
|
||||||
@@ -196,15 +201,14 @@ def _(df, pd):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
print("✓ No multi-context rows found")
|
print("✓ No multi-context rows found")
|
||||||
return (expanded_df_raw,)
|
return (sentiment_df,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(expanded_df_raw, mo):
|
def _(mo, sentiment_df):
|
||||||
# Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
|
# Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
|
||||||
rows_to_edit = expanded_df_raw[
|
rows_to_edit = sentiment_df[
|
||||||
(expanded_df_raw['manual_analysis'])
|
(sentiment_df['manual_analysis'])
|
||||||
& (expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False))
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Create data editor for split rows
|
# Create data editor for split rows
|
||||||
@@ -232,43 +236,22 @@ def _(mo, rows_to_edit, split_rows_editor):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(expanded_df_raw, mo, pd, split_rows_editor):
|
def _(mo, split_rows_editor):
|
||||||
# Reconstruct the full dataframe using the editor's current value
|
# Capture the edited manual-analysis rows for validation
|
||||||
# This will update whenever the user edits the table
|
|
||||||
mo.stop(split_rows_editor.value is None, mo.md("Submit your changes."))
|
mo.stop(split_rows_editor.value is None, mo.md("Submit your changes."))
|
||||||
_edited_rows = split_rows_editor.value
|
reviewed_manual_rows = split_rows_editor.value
|
||||||
_static_rows = expanded_df_raw[~expanded_df_raw['manual_analysis']]
|
|
||||||
expanded_df2 = pd.concat([_static_rows, _edited_rows]).sort_index()
|
|
||||||
return (expanded_df2,)
|
|
||||||
|
|
||||||
|
# Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
|
||||||
|
if not reviewed_manual_rows.empty:
|
||||||
|
valid_sentiments = {-1, 0, 1}
|
||||||
|
needs_review = reviewed_manual_rows[
|
||||||
|
reviewed_manual_rows['manual_analysis']
|
||||||
|
& ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
|
||||||
|
]
|
||||||
|
assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
|
||||||
|
|
||||||
@app.cell
|
print("✓ Manual-analysis rows have valid sentiment values")
|
||||||
def _(expanded_df2, pd):
|
return (reviewed_manual_rows,)
|
||||||
# Verify no rows have multiple contexts
|
|
||||||
try:
|
|
||||||
has_comma = expanded_df2['_context'].apply(lambda x: ',' in str(x) if pd.notna(x) else False)
|
|
||||||
assert not has_comma.any(), "Some rows still have multiple contexts (comma-separated)"
|
|
||||||
|
|
||||||
# Verify that rows still marked for manual analysis have sentiment values
|
|
||||||
manual_sent_rows = expanded_df2[expanded_df2['manual_analysis']]
|
|
||||||
theme_rows = manual_sent_rows[manual_sent_rows['tag'].str.startswith(('VT -', 'CT -'), na=False)]
|
|
||||||
missing_sentiment = theme_rows[theme_rows['sentiment'].isna()]
|
|
||||||
|
|
||||||
assert missing_sentiment.empty, (
|
|
||||||
f"{len(missing_sentiment)} rows marked for manual analysis "
|
|
||||||
"have missing sentiment values"
|
|
||||||
)
|
|
||||||
|
|
||||||
print("\n✓ Verification passed: Manual-analysis rows are consistent")
|
|
||||||
|
|
||||||
expanded_df_final = expanded_df2
|
|
||||||
|
|
||||||
expanded_df_final
|
|
||||||
|
|
||||||
except AssertionError as e:
|
|
||||||
print(f"\n❌ Verification failed: {e}")
|
|
||||||
print("Please review the data before proceeding")
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
@@ -287,7 +270,7 @@ def _(mo):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(df):
|
def _(sentiment_df):
|
||||||
# TODO: Implement sentiment analysis and add 'sentiment' column
|
# TODO: Implement sentiment analysis and add 'sentiment' column
|
||||||
|
|
||||||
# for now, create an empty sentiment column with randomized dummy values for testing
|
# for now, create an empty sentiment column with randomized dummy values for testing
|
||||||
@@ -299,12 +282,31 @@ def _(df):
|
|||||||
return random.choice([-1, 0, 1]) # Random sentiment for testing
|
return random.choice([-1, 0, 1]) # Random sentiment for testing
|
||||||
return None
|
return None
|
||||||
|
|
||||||
df['sentiment'] = df.apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1)
|
# Only run on rows without manual_analysis
|
||||||
|
|
||||||
df
|
sentiment_df['sentiment'] = sentiment_df[~sentiment_df['manual_analysis']].apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1)
|
||||||
|
|
||||||
|
sentiment_df[~sentiment_df['manual_analysis']]
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(mo):
|
||||||
|
mo.md(r"""
|
||||||
|
## Recombine
|
||||||
|
""")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(pd, reviewed_manual_rows, sentiment_df):
|
||||||
|
_static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']]
|
||||||
|
recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
|
||||||
|
|
||||||
|
recombined_df
|
||||||
|
return (recombined_df,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(mo):
|
def _(mo):
|
||||||
mo.md(r"""
|
mo.md(r"""
|
||||||
@@ -328,5 +330,22 @@ def _():
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(mo):
|
||||||
|
mo.md(r"""
|
||||||
|
# Save to CSV
|
||||||
|
""")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(WORKING_DIR, datetime, interview_select, recombined_df):
|
||||||
|
# Save to CSV in working dir
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||||
|
filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv"
|
||||||
|
recombined_df.to_csv(filename, index=False)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
app.run()
|
app.run()
|
||||||
|
|||||||
Reference in New Issue
Block a user