diff --git a/Taguette-Preprocess.py b/Taguette-Preprocess.py index bda2dff..6fcea98 100644 --- a/Taguette-Preprocess.py +++ b/Taguette-Preprocess.py @@ -34,40 +34,44 @@ def _(INPUT_DIR, mo): return (file_dropdown,) +@app.function(hide_code=True) +def csv_to_markdown(df): + """Convert a transcript DataFrame (reads the "Speaker" and "Transcript" columns) to markdown, merging consecutive same-speaker turns.""" + lines = ["# Interview Transcript"] + + # Track previous speaker to detect when speaker changes + prev_speaker = None + # Accumulate text from consecutive turns by same speaker + merged_text = [] + + for _, row in df.iterrows(): + speaker = row["Speaker"] + text = str(row["Transcript"]).strip() + + if speaker == prev_speaker: + # Same speaker continues — append text to current block + merged_text.append(text) + else: + # New speaker detected — flush previous speaker's block + if prev_speaker is not None: + # Format: **Speaker**: text-part-1\n\ntext-part-2 (the blank line keeps merged turns as distinct paragraphs) + # Join via str.format, not inside an f-string: a backslash inside {...} is a SyntaxError before Python 3.12 + lines.append("**{}**: {}".format(prev_speaker, "\n\n".join(merged_text))) + + # Start new block for current speaker + prev_speaker = speaker + merged_text = [text] + + # Flush final speaker's block + if prev_speaker is not None: + lines.append("**{}**: {}".format(prev_speaker, "\n\n".join(merged_text))) + + # Join all blocks with double newlines for clear separation + return "\n\n".join(lines) + + @app.cell def _(file_dropdown, mo, pd): - def csv_to_markdown(df): - """Convert transcript DataFrame to markdown, merging consecutive same-speaker turns.""" - lines = [f"# Interview Transcript\n"] - - # Track previous speaker to detect when speaker changes - prev_speaker = None - # Accumulate text from consecutive turns by same speaker - merged_text = [] - - for _, row in df.iterrows(): - speaker = row["Speaker"] - text = str(row["Transcript"]).strip() - - if speaker == prev_speaker: - # Same speaker continues — append text to current block - merged_text.append(text) - else: - # New speaker detected — flush previous 
speaker's block - if prev_speaker is not None: - # Format: **Speaker**: text-part-1\ntext-part-2 + blank line - lines.append(f"**{prev_speaker}**: {'\n'.join(merged_text)}\n\n") - - # Start new block for current speaker - prev_speaker = speaker - merged_text = [text] - - # Flush final speaker's block - if prev_speaker is not None: - lines.append(f"**{prev_speaker}**: {'\n'.join(merged_text)}\n\n") - - return "\n".join(lines) - # Preview preview = mo.md("") if file_dropdown.value: @@ -76,7 +80,7 @@ def _(file_dropdown, mo, pd): preview = mo.md(md_content) preview - return (csv_to_markdown,) + return @app.cell @@ -87,17 +91,30 @@ def _(mo): @app.cell -def _(OUTPUT_DIR, Path, convert_btn, csv_to_markdown, file_dropdown, mo, pd): +def _(OUTPUT_DIR, Path, convert_btn, file_dropdown, mo, pd): result = mo.md("") + saved_md = None if convert_btn.value and file_dropdown.value: _df = pd.read_csv(file_dropdown.value) - _md = csv_to_markdown(_df) + saved_md = csv_to_markdown(_df) _out_path = OUTPUT_DIR / (Path(file_dropdown.value).stem + ".md") - _out_path.write_text(_md) + _out_path.write_text(saved_md, encoding="utf-8") result = mo.callout(f"✅ Saved to `{_out_path}`", kind="success") result + return (saved_md,) + + +@app.cell +def _(mo, saved_md): + saved_preview = mo.md("") + if saved_md: + saved_preview = mo.vstack([ + mo.md("### Saved Markdown Preview"), + mo.md(saved_md) + ]) + saved_preview return