diff --git a/01_Taguette-Pre-Process.py b/01_Taguette-Pre-Process.py index ba4dcdb..989a0f4 100644 --- a/01_Taguette-Pre-Process.py +++ b/01_Taguette-Pre-Process.py @@ -55,7 +55,7 @@ def _(file_dropdown, jpmc_transcript_to_md, mo): preview = mo.md("") if file_dropdown.value: md_content = jpmc_transcript_to_md(file_dropdown.value) - preview = mo.md(md_content) + preview = mo.md(md_content[:1000]) preview return diff --git a/utils/transcript_utils.py b/utils/transcript_utils.py index 3ac8bf7..6b7e82b 100644 --- a/utils/transcript_utils.py +++ b/utils/transcript_utils.py @@ -106,6 +106,12 @@ def cpc_smb_to_markdown(cpc_path: Path) -> str: for line in content.splitlines(): line = line.strip().replace('\n', ' ') + + # Handle edge case: "CPC1, (She/ Her,) LOCATION: Hello." -> "CPC1: Hello." + match = re.match(r'^"?([A-Za-z0-9]+),\s*\(.*?\)\s*LOCATION:\s*(.*?)"?$', line) + if match: + line = f"{match.group(1)}: {match.group(2)}" + # Remove surrounding quotes if line.startswith('"') and line.endswith('"'): line = line[1:-1].strip()