taguette pre-process
This commit is contained in:
120
Taguette-Preprocess.py
Normal file
120
Taguette-Preprocess.py
Normal file
@@ -0,0 +1,120 @@
|
||||
import marimo
|
||||
|
||||
# Version string stored in the notebook file (presumably the marimo release
# that generated it).
__generated_with = "0.18.0"

# Notebook application object; each @app.cell function below registers on it.
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Shared imports: the returned names are what the other cells receive
    # as parameters (Path, mo, pd).
    from pathlib import Path

    import marimo as mo
    import pandas as pd

    return Path, mo, pd
|
||||
|
||||
|
||||
@app.cell
def _(Path):
    # Where the raw CSV transcripts are read from.
    INPUT_DIR = Path("data/transcripts/raw")

    # Where the converted markdown files are written; created up front
    # (including parents) so later writes cannot fail on a missing folder.
    OUTPUT_DIR = Path("data/transcripts/clean")
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

    return INPUT_DIR, OUTPUT_DIR
|
||||
|
||||
|
||||
@app.cell
def _(INPUT_DIR, mo):
    # Collect every CSV directly inside INPUT_DIR. Sorted because Path.glob
    # gives no ordering guarantee, and an unsorted listing would make the
    # dropdown order vary between runs and machines.
    csv_files = sorted(INPUT_DIR.glob("*.csv"))

    # Dropdown label -> value: file name without extension -> path string.
    file_options = {f.stem: str(f) for f in csv_files}

    file_dropdown = mo.ui.dropdown(
        options=file_options,
        label="Select CSV Transcript",
        full_width=True,
    )
    file_dropdown
    return (file_dropdown,)
|
||||
|
||||
|
||||
@app.cell
def _(file_dropdown, mo, pd):
    def csv_to_markdown(df):
        """Convert a transcript DataFrame to markdown.

        Consecutive turns by the same speaker are merged into a single
        block. Rows whose transcript cell is missing or blank are skipped.

        Args:
            df: DataFrame with a "Speaker" and a "Transcript" column, one
                row per turn, in speaking order.

        Returns:
            The markdown document as one string: a title heading followed
            by one ``**Speaker**: text`` block per run of turns.

        Raises:
            KeyError: If the "Speaker" or "Transcript" column is missing.
        """

        def format_block(speaker, parts):
            # Join outside the f-string: a backslash inside an f-string
            # replacement field is a SyntaxError before Python 3.12.
            body = "\n".join(parts)
            return f"**{speaker}**: {body}\n\n"

        lines = ["# Interview Transcript\n"]

        # Track previous speaker to detect when speaker changes
        prev_speaker = None
        # Accumulate text from consecutive turns by same speaker
        merged_text = []

        for speaker, raw_text in zip(df["Speaker"], df["Transcript"]):
            # Empty CSV fields are loaded as NaN; str(NaN) would put a
            # literal "nan" into the transcript, so treat them as no text.
            text = "" if pd.isna(raw_text) else str(raw_text).strip()
            if not text:
                continue

            if speaker == prev_speaker:
                # Same speaker continues: append text to current block
                merged_text.append(text)
                continue

            # New speaker detected: flush previous speaker's block
            if prev_speaker is not None:
                lines.append(format_block(prev_speaker, merged_text))

            # Start new block for current speaker
            prev_speaker = speaker
            merged_text = [text]

        # Flush final speaker's block
        if prev_speaker is not None:
            lines.append(format_block(prev_speaker, merged_text))

        return "\n".join(lines)

    # Preview: render the selected file as markdown, or nothing when no
    # file is selected yet.
    preview = mo.md("")
    if file_dropdown.value:
        df = pd.read_csv(file_dropdown.value)
        md_content = csv_to_markdown(df)
        preview = mo.md(md_content)

    preview
    return (csv_to_markdown,)
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Button whose value gates the save step in the cell that consumes
    # convert_btn.
    convert_btn = mo.ui.run_button(
        label="Convert to Markdown",
    )
    convert_btn
    return (convert_btn,)
|
||||
|
||||
|
||||
@app.cell
def _(OUTPUT_DIR, Path, convert_btn, csv_to_markdown, file_dropdown, mo, pd):
    # Shown until a conversion has actually been performed.
    result = mo.md("")

    # Only write when the button was pressed and a file is selected.
    if convert_btn.value and file_dropdown.value:
        _df = pd.read_csv(file_dropdown.value)
        _md = csv_to_markdown(_df)

        # Same base name as the source CSV, with a .md extension.
        _out_path = OUTPUT_DIR / f"{Path(file_dropdown.value).stem}.md"

        # Explicit UTF-8: without it write_text uses the platform locale
        # encoding, which can fail on (or corrupt) non-ASCII transcript text.
        _out_path.write_text(_md, encoding="utf-8")
        result = mo.callout(f"✅ Saved to `{_out_path}`", kind="success")

    result
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Static note pointing to the Taguette instance for the next step.
    mo.md(r"""
    # Taguette

    Upload and process using taguette: http://taguette.tail44fa00.ts.net/
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Empty cell: no code, no outputs.
    return
|
||||
|
||||
|
||||
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
|
||||
Reference in New Issue
Block a user