From 50f9538dcf44690e6cabdec029d5288be2560534 Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Tue, 16 Dec 2025 23:21:03 -0800 Subject: [PATCH] format for consecutive runs --- .gitignore | 3 +- 02-B_Thematic-Processing.py | 110 +++++++++++++++++------------- assets/JP-Morgan-Chase-Symbol.png | Bin 0 -> 13158 bytes 3 files changed, 64 insertions(+), 49 deletions(-) create mode 100644 assets/JP-Morgan-Chase-Symbol.png diff --git a/.gitignore b/.gitignore index 3c1ca91..42d70d7 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,5 @@ __pycache__/ data/ docker-volumes/ -logs/ \ No newline at end of file +logs/ + diff --git a/02-B_Thematic-Processing.py b/02-B_Thematic-Processing.py index a43cd59..f3411c8 100644 --- a/02-B_Thematic-Processing.py +++ b/02-B_Thematic-Processing.py @@ -22,7 +22,6 @@ def _(): tqdm.pandas() - client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False) TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export') WORKING_DIR = Path('./data/processing/02-b_WordClouds') @@ -32,14 +31,14 @@ def _(): if not TAGUETTE_EXPORT_DIR.exists(): TAGUETTE_EXPORT_DIR.mkdir(parents=True) - model_select = mo.ui.dropdown( - options=_models, - value=_models[0], - label="Select Ollama Model to use", - searchable=True, + return ( + OLLAMA_LOCATION, + TAGUETTE_EXPORT_DIR, + WORKING_DIR, + connect_qumo_ollama, + mo, + pd, ) - model_select - return TAGUETTE_EXPORT_DIR, WORKING_DIR, client, mo, model_select, pd @app.cell(hide_code=True) @@ -159,8 +158,27 @@ def _(mo): @app.cell -def _(mo, start_processing_btn, tag_select): - mo.stop(not tag_select.value, mo.md("Select tag to continue")) +def _(OLLAMA_LOCATION, connect_qumo_ollama, mo): + try: + client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False) + model_select = mo.ui.dropdown( + options=_models, + value=_models[0], + label="Select Ollama Model to use", + searchable=True, + ) + except Exception as e: + mo.md(f"Error connecting to Ollama server at `{OLLAMA_LOCATION}`: {e}") + model_select = None + client = None + + model_select + return client, model_select + + +@app.cell +def _(mo, model_select, start_processing_btn, tag_select): + mo.stop(not tag_select.value or model_select is None, mo.md("Select tag to continue")) start_processing_btn return @@ -172,19 +190,21 @@ def _(client, mo, model_select, pd, start_processing_btn, tags_df): # Wait for start processing button mo.stop(not start_processing_btn.value, "Click button above to start processing") + if client is not None: + df = tags_df + # Run keyword extraction - df = tags_df - # Run keyword extraction - - df['keywords'] = df.progress_apply( - lambda row: pd.Series(ollama_keyword_extraction( - content=row['content'], - tag=row['tag'], - client=client, - model=model_select.value - )), - axis=1 - ) + df['keywords'] = df.progress_apply( + lambda row: pd.Series(ollama_keyword_extraction( + content=row['content'], + tag=row['tag'], + client=client, + model=model_select.value + )), + axis=1 + ) + else: + mo.md("Ollama client not available, See 4b) for loading data from xlsx.") return (df,) @@ -251,7 +271,7 @@ def _(KEYWORD_FREQ_FPATH, mo): @app.cell(hide_code=True) def _(KEYWORD_FREQ_FPATH, freq_df, load_existing_btn, pd): - if load_existing_btn.value: + if load_existing_btn is not None and load_existing_btn.value: _fdf = pd.read_excel(KEYWORD_FREQ_FPATH, engine='openpyxl') # Drop nan rows if any @@ -305,30 +325,6 @@ def _(mo): return (min_freq_select,) -@app.cell(hide_code=True) -def _(freq_df, frequency_df, min_freq_select, mo): - mo.stop('keyword' not in freq_df.columns, "Waiting for keyword extraction to finish") - - MIN_FREQ = min_freq_select.value - - freq_df_filtered = frequency_df.loc[freq_df['frequency'] >= MIN_FREQ] - - freq_df_filtered.reset_index(drop=True, inplace=True) - - keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict() - - table_selection = mo.ui.table(freq_df_filtered, page_size=50) - table_selection - - # keyword_freq_filtered = {kw: freq for kw, freq in keyword_freq.items() if freq >= MIN_FREQ} - - # # create list of keywords sorted by their frequencies. only store the keyword - # sorted_keywords = sorted(keyword_freq_filtered.items(), key=lambda x: x[1], reverse=True) - # sorted_keywords_list = [f"{kw}:{freq}" for kw, freq in sorted_keywords] - # sorted_keywords_list - return (keyword_freq_filtered,) - - @app.cell(hide_code=True) def _(mo, tag_select): mo.md(rf""" @@ -349,6 +345,24 @@ def _(mo, tag_select): return +@app.cell(hide_code=True) +def _(frequency_df, min_freq_select, mo): + mo.stop('keyword' not in frequency_df.columns, "Waiting for keyword extraction to finish") + + MIN_FREQ = min_freq_select.value + + freq_df_filtered = frequency_df.loc[frequency_df['frequency'] >= MIN_FREQ].copy() + + freq_df_filtered.reset_index(drop=True, inplace=True) + + keyword_freq_filtered = freq_df_filtered.set_index('keyword')['frequency'].to_dict() + + table_selection = mo.ui.table(freq_df_filtered, page_size=50) + table_selection + + return (keyword_freq_filtered,) + + @app.cell def _(): IGNORE_WORDS = { @@ -433,7 +447,7 @@ def _( if logo_switch.value: # 1. Load the logo # Make sure this path points to your uploaded file - logo_path = "./data/assets/JP-Morgan-Chase-Symbol.png" + logo_path = "./assets/JP-Morgan-Chase-Symbol.png" logo = Image.open(logo_path).convert("RGBA") # Optional: Resize logo if it's too large or small for the canvas diff --git a/assets/JP-Morgan-Chase-Symbol.png b/assets/JP-Morgan-Chase-Symbol.png new file mode 100644 index 0000000000000000000000000000000000000000..6189df0e1694ef095d38507aa8710eaf535caff0 GIT binary patch literal 13158 zcmd6N2|ShQ+y6Po!8u4|4@rtdoT;o?rkbdb88!ChNTm`^NVYhL7NkjvvP`38%63$; z^H7tKEy)t%X`@h1o2A76x}QVS%=`Pz??0c<|9$_jPwxAwA5#<$iL`(eF2# z&k>q0L?V&qY~1jj4T&@tE@>>bb^h4T)5cC_L%T4{Nf&jrm$qUX#L0 zScrUYxploc%aY_=!TUE+e9!>y^99&!-a@*zsWkrQJtWdX(#G%BYz^uf?lBH@bdZj9 zCs%vxkv9$;5|yLOKPK2R(YAQm==V(8A*mJAC|6g--w&2L7uwc6U%TSuxIyU109|S@ z<3OEKe^1$J(EV?Ju714iM!v1n^GOkNvy6~)l3m>_TsfxveP!j?#^KtTpU; z&x{dvT}Ymz5hM1_Ng+?NW<+)0r~MbjVli@=&4%E53@oN#Kk?~Csr$??aZ^bd>_1N| zqzsuOZPlLkKPeim35%8L$WlAf%%OyGMc&BSI%gWK6O#W$aCnol$t|IVvsXs*Z180#CiP=_lFC3TQi$LF-pBK5l0O6C!z4dUDm!MQ0tQ~Tdv9#TLa z1WucLY_&Scc2QqkRQG0DL-Cxr$meqdqMgZW{hSo%>RW*B67S;zMphCWam$^pPy1vt zF3>w^svoFL3*gQmJu;Qp1_YHd>m=bvLM1zF>K`c-5WXEtS`>+u?G+#7$ zxe(w>YX6wZmT9ELzE0q$=tgZcicAzN-($vj@3T~#TOF->dB;(}~o z*bG~pcrFdi049LaK?Ao=ZpP;$&kb{JfU3HNP5gkX;g_vwkz+wD>^1OcwauzYwi*ya z<>d4BSfh$s(vCv*ScU;zjg^XUJf&($0FLcx*bd}zyhQP4&=9Z>{jP$bKc=c`4IQs7^0Z1Bu)!-3jsrH_k#rU0`U;2X2Mp zu9B6ELg(4r&KZyMw5E8pu`BEjQ&n^KVxiWV+}Kj`^x(@RFG>eed(sMbL~5_moi)LE zXNh8atT%gIK=xfw_+8Nz1yIOpwKario&AM;0C}rKu_NMT55-Ox6xdkde&mrf@;o$y z$jx6-e76q`qDA{%!7vt$=B0|8UuJUzP(=4F=O6Aqbz4UWa**bAp9;x*RFzeYxM`;5 z$%5=zy7M=Pz((HE(jPS4E}REXoaGTz`&LyTV53f?g#ZXUsInT_2SoV{(44?LX^A5$ z7N=r5P)|n(ve?rBEQKRGmxBYvW@OWS3bdW=)XD8b--^{Q1T{y!)DfGyk=aODf#q2Q zq?@WG59VU&&blBbB2l~-IR44<9O$n(D-30E)N2VLtAb?pB83IZ{WWQ(Fz=;GL{MZz zA&65{EyUd<-Pr(iEHt5wA2Anra`w)OK0!}p%b1g$6}N4+G0zSne3+!Tyg$e}h6MnUDl zrAosS94z!*x?tf?uDV;Hvd-F;5}uq6vo0lEYNYMF8YGEI%J5$R@Q{Z;c_8LP5gxB= zFE}rwM5JS(G&dXgezc1cULiOTYtlfS6bKbOY$5_~LWH8yRMWwyNSq4-JfZdYv)fXPcB2?NQImO8^=-S^hyB-6)cOlCSpg29PuD zG&H52#VAwN)#N8PAx#vW)t}vwri3?7Q%&{P^gM9HMhXr@6<<-s$5FGl8?O(|C-7+r-=t914iOZ(56!*Q~Yx>ySX-WfALL3NX=PY zLRJQ(3ck?U++}DJU<&ASqF1gT1AZ#4HbSWJ0dsEjy5By8Npn+QDK5*qOdoFm^C3Hl z_$cGoS4Neby@-I#NA9%~XHCu;OolbWY#F0Ub1eczr4It0s*h**K0_#~#=SSG#bu|U zgvv`sf?bH%oZTq8)nM?9x1v@Eqc_tfpc}@9&hJs+B<3EMQ3O@fw|n0_xo;vg zz73E(X{S`m+6k2uqqO7jV13~OLNtEzbvi?9_kC*E#?|KWK2Q>Uqw-N&5P^uh$!k9n zMNx8vux|gShUeDRuAbEj_9YY)RvN-SlWLjT_|2fe`f*CjOK?qnv`OyZW+pkaCV8#a zvuJic2J?1h+$)H6zxW72${r#~;8T(BO+Z?`-GD0LUH>!0bB^|EV^-{w%ulGrFbHeNn zhib;LKcAfk!V;ecX;+{ugD7Bl#~(AuKz#-sYmr;g778>zGbGe31ikjzi|`2-g+;A7 z{LW-+q10V!xDY^51Z}MfE{nEk9s{gICwc}M41p#_qvmV0Vo|%=-(Trc@(zs*GOAs% zLRNgwYfaoSKJTXq$c@V%UnrzTfYoJy1?oBIT2kot(OF;cIsh(Tfodl&7qRgQwd-tJ zv1*2-Us4P^`^iRSJqO`pEpxrP2EcNlxOk%v15n=~+Lj93GGkQ3I96P`$@LXt$i99N zKB#Zu>ZO8zjDLO7u1mPsq%!+fIg);J;Y2WC8u9l_Z7_&L1rK!@rqtKw-cBI$w8Ynd z!U9|LR$FkBl9%MR&I0y89q`h{FHV&m z49uLo?TShH!{mTT$tx~3y`f3TOBosDddQPI8j{xTL^;b&lGt(SiFyi20o~muHJWDg7rbG>C^~1-z2<#AVzpsktG;uw1Ng2 z3-PFjjE*keTDe;Vr)h6a_I)UFz*Hd;H8kcnY><#UD=|_!3*w zV<3HtAelHe8@hwFfm~-naxM45iRmrWw5KmzC(4r9(as8*5k=B0vA(v)6UuOE_2&|2 zU*S@4`7IQI3hfE$xLRB#5kW;#mQ?C)1N`4Ym1@LNCK zsN766HXadQkHHcnRE6L3)hF_E*%FE_T5gKev=>kOCa%)-e*$*f)PA(2!l zoX`$38{fPwhhBI%bYF#$wldlDECe>^j0h#1FVk5YGR9tQth*wC?Vs?vC=``O3P#C@ z92rc(P$qza1g0Yq6oe(5FGF(FBvO!UzPj{rUhXk=mP-$-<0hr4f1`V!D6*}C>Z<)0 zVsu1^0x8D_jzLfaL-*YQZT>Eg;N-%cL64UccJQY%aO^XXgu~*JH0(&5xjn zl(j{>sM4&wMBda)Oq`^o*Y#$Ym0Ws&+bG57m|9G_fpD>UWp?YC9nhrWiS$Ir4ZKZ4 z>GYjai-ARwVtOHRK*sY?;uyu}xLS;&0rHmD*U+)(m&@yGxU|8rs|dxXC9eMUtzT%x z>+LRkQ-t-;@YGDt3-jwnge3Ts2|^fjgKauv(Ba|tioa)>O0l3PFBb65=t;b0p!W;< z@z>qRkGM~|k^E`(Nj3cy60BQcc9(;lzBIMj2HV|#yZa`3?aNDNcL(l&Y9&wYx!LT~ z1M-|C4LEVqyaJvh0ATpqb+*&MM)zB%iR#xSc!ZY>9tBOXDf9V@`w+x|(&7M!K#7t1 zsz`o?h6DQ@)3Ip4ybV$TMxvn)p{D8_&0hWHNvHgcrIgpV+Y0q|EyrEQb#}xBQeN+* zW~P-P2DAJThx0_+mF=a#2FiI?H{4_{D;v!`@nyqXn z&baMJmxTjF1oA5uj44CP$!^Q#7Xhs2;pqQLV5c_o_x;i!AWdGElfbT~N;3jhNAioH zyb^;ZD6*woh5_e+te+HYNM9G%i2J6F~dE2;LLBC+2UDzB- zUYB>BunBeehwJjH+=a{|h0DO86YKzoJJ5P5ti52J{zYOKd=JX4V6VX_0%T!XssgSw zefE)WlN(g^ud!8C)1NXaVjQ*ypVo63ej0?5tYME)Hj5TdUL})-_4ROVHF3CIKt7YW z2=@8&X-!FJLiGwxmI+Zc@gDYPHDZ6KkTi!K%{R4(kC0k1_$@|hn$!7Cn_PAoXMbLK zn$4f4z+F$HNV9yfDJ4g&NnM{gJ$E^itR^_zcb;Dphh1ql7)4aH)7JnrI5$NYFM$bhz;NhXm*PNnP+VC<-|j`!iA8H0>cCMD1jE$# zi${ZbdG2}(mZll{{2q`cF zNPYcaGsO&E`aO10HJPTdKxY|!{8!LukHMprK`r2eCKH&Eiw^;-{PAplqv*H=u$ipF zBsb$%XSX6PAkE*i5i>ako(jv(=pgQb7}7!rfoh601Kk#f{c!J(tus3Q*h(mw-AX8# z)rz(fN+2N99BV-e34Xqv%1_`jae)Ov#BZb^T)#Q7_zY;^@ zUyFV%hQ$9y^lLF>`D;<0Qn)i~_J5zr|AKcG>R%Z8J2(Hz(0}mqpO~C-|2M(>2R8pz zRQ|@x|La8f8#n)f$$yf1yzKfP&Gvt4lGs1X?%y!=^st{@A3ec9Y6dPy?@t(5Y4k`_m}+nUtVEY&jS89)1*q`Do z`I$-;eXc!W(f|G80@4NJF@ui$NWD(~#v!K&6Ot^G&CkF{d?E~tGIK4=WRnH{a{XU! zAR)9-nkV}&uQhpkh&|}qo-jk=o4sZ^d6SSK-V;uLz}&Bp=v+3R5pSpD66Rr>aDd>< z7RR5Zz_Nk@8hS`yZi<7pCU3MUbOVVmhkp51UM-$lPs zbR#b^F;$MFNGJ7l6jGzbJR<_SYA0^qVTc zfQF#A-&}A8NT>c{D2%%t?wiEGeY9=-*%lxokyxJrYpeUuMDYg11zw@8{FQS-ZtETylbRPI%0vLD+nQORT5>aJgGMwq6tETF0mD{Iby;qUIoKsALqrUp zTq7i4G$8=Y_Uiqb>s!@vW|!L{Ao9chv9ii8(=i__=A%;b&7GukSYRsRS}Jg^lLn|# zv9AWLb`Li=sj#gVgeL62p+gzR3pzSgyc0;H?b&)Q(w&h_Ad;cJT`JB=l zI%vk(28!BlKFuX#;mvtny5%~gPHNsw9p5=5-N+$tQeFtACAZ8+*!cr8k(N~Rk!+n2 zTw$P|qr@6Xm9IVq8G}^Lj&|u)mvR(2bms#qe*5`GZx7|I%iDugzfF^DpELgKsC@NZ z{xI%sp8dx&_6II-c4EgT5||L?5WKA@I?okTWqqFTx*J0NWLzc?!p?Jq8^UuQv_6d*hxSLbD!s{ zXH&rX>7<&g5Why98oRkniB6g=6agCXq_MqJ4H+=UT>bGXxqK`wf%zL{3fIx=JX`5V zD%-y68o6o{Ny(^RIyBL33EK)+^|!I7gKF>o3!Hf~o|s#gG~S&2k=#kujY@{}*Nu`A z>0GkzdIHIo6!i9lQ!XEef>j*u*|S+lg9I);0#_j1#I`RZ67(^ov_diHA$h7#>71y? z?8F88>LDHhSTCzJS3n?(44BH7N4K!8Gg#dQ>J2j)P(F4@zL~a}ScwVnjfMuAF%F@8 zpI$eQm=h%LpeCXggL1ZhqdRnocgf;u!o_1@JE*nLPj*lR`E(;cczTJfgVg+ZXQMl+ zdZq4tDfG_Pw%SRZ9jdU7_<*@D28KVbR?(lwt*&pgd$gAnF)`h}B1($fX1O^RUW$itUaX_Eb9uyRHGw^cx%N!$4pf#R_}2m#c#ioPtz zLO`w>rIYQn*R5^RplG^zWN`Z$LBj6Tx!(yuwSHLZw5VPX zv_poXH3DK1d^tp2r8o@m#^X#Zw0Wx;OB0{#dLrr0`xxV}~>Y_6t-MLvP4!lQx4X3z4U_ig;vwU?6{;>D@9|AWES?~{g52l$h;;7IJ zJ=-E37UM~m)n{$Xa(i1(i7TMCF0LSgqS(;BU7FTOZ7=!BVIk-mxViqiJ|GJS9J!|{ z3JoDWJ*nm`$vWd=lMnOyDy+f|bis*|BRAoaejlh?5fu$6dE?nFf}TG>o;Kdy=LIt2I>1Ttu3{axDeKb-LIe(WHMXN~=u>dn;@1l1*=+K=?f zcy*&Meq9o1?;p!(wgFpbGm~nz&CE7X&?k@&-&ASs7TzTXY4(E|3;Mnj&%gE^35QNY zVYctsKQkMsYf?aaJw%1doQpOC%?NmA`8v63ILCU=Yt3nSP7aMJEmq93KoP9(W|Xdi zU8P15RwXEuXNu-8bO91I9z6F}c0oLN`PmesAn-zAw}Yw;Wk~(12t2}F&Lq{WCtJJz zoac2rU=8?_mvbw?0L_gXA4dUSs7iR_+>A_=Rj!$NK8-JJ5QRQsmo3O9!C|G9ioXhz zO+vy-O#q7|UQv5_D3r8c4qE>HBmo;4GNgmvZi~T1U-Ug%T$O#D_5~U$;<{U{-hh{i z_wp`C$TtU-Y&S)S^rM{PZTX@znF0BcHj^deiGi=(Y74U2PxVJ7+waE6J8r+xcT71I7!>G@Mn9mE>79vt~c@1J^mw~)89?( zN?xYEVk}JOU3W>XDnrrnNZ|m0HGJm3|D)*mvwJPd1)~Za74tLYRu6qKljh@_OKduQ zF`pIp)a`^m&FQ3Ga&jsemE$%Yy+u1dr*lVmJp7^9iuv_edh@=QFQj8YQ*zA|B`_(n z2~@aY=Cuh;ulT#|F!{@KRP=S3XC}UhiiQTs8`a%&JHw&E)ZNc_x}v#eVDP;Jhi)m? zduezHQ%`QGhx$Vkii!Si$;X@AwaKe@oq}13Tvo>xhriy^<6-318Qh{Z%?TPv`{78D zI_e->np|&O1RLs}OSN~vfA8z3f{9)Gv7L^Yr+=`Uj9;E;U^3=*BYJdR}W{n2UQS2{xO#CyYb@I z+R`OJ8d(MY$bDww<$aFJ<@I7_rf2QKesrNHCwqGdigqQI_dR1E%Q7ZNDERWmF)0E0Fi);Ii$u?MG z2mWwtU6;^!w?t4r=FoGOWkv+Vb++S+L1NzRSauPR177A&kE+3 z7=`(T%XUVyedh)ZOfsppkPR!#3)#L<6FY-elKbX!=nY&~`gl7iH)Cqx-AL&-_Cn(~ z^JCA6$ii2nl+qQ=n2X>+ul{P;>H;v1?w-TqZ#iQ{;QC6@E#$s=pk~7`f7n5w(4$}; z_rpwzbF=)T#o;5A>XK?A$2%F)!elpj*d0_pk6Q?FXg)$4?_yhv&UsoO$xU{nOQeXW z(#J~y&YkUx@FB=rx#V1SKD>9{fy57b`a5!BSd|Y2UF#jn!mt3Q?3rYCJ{(0mxA#*j zjez@tRH5;=$R$s0Jnl=mqN43J&_Z5R`8&#H7P7UW3f$epq$!mQJo;A6xFt2r9l4-T z?L>By3a``8X%nyBYjy~=HUF?U@ zCAg=t;iC91Td-6DdGfIeN^B97m|V$GcIM-MRRLLgQk-)l)NWzW=Lgx*bmt}WeLuzI z*1zg{^NfHs)f8YV3cekx%2(1$8&?-pi>67CO@FIic-t1sd>0(Z3;2Z;`nbQ^00lXj z?jI?xAP=5BgYVfk+~-!kg$>vpj)1f0b_o~$ZRL! zRWF2AwZjsU_Jr3Wd^$8>0VB$v&qQDJf`ibw#AWbTk0~L`Yb^8|GF(Bm!;wZBWYaO< zn&L)iOG#eKi09B>&@Mb73wo;nAV$X7*tFBNC`bXYTWZS9;aMl${q8q7j zBOX2vJaC*`aI;ZxvPl_3EwP0+;bJGey5QxFQ8$_(2<@iDmN)e$2!{Dl5lnOt@J4hf z(8ultKpL=FR)XbkCh0{!s97D=^CG6PtB=9hTkmctJ=MNz4++cY6k5jEtviD~$ukjU zhPm~e{se*Hcf-Hd+-QOCSBG5X_^yPkHq7aQFDBP{pbFrHo>u|T;p5iFK}|k*avr@7 z47MrKiicBP>bzBrqqINNGjm|C+ukq84;-tDib0Oy0l4RrDyT+js>acONg zd=oG7%DrpJaAf^#cHI{BmHAeV^zj!C9_YAv^XoTJwXcV#fB91xSY8~pf}XPF%Go^0 zsZS`Y3eOgaK`dr;MaK03WhG=}WxOuYTE#oCxTXj|WXDr2$L)n}W|dwp7U^|(gNh|z|w zt+mN(m5h&yJ3q?42W_|N2)`MN8)whE{1E!Y!W!g72T;3P){%~UD zWLu`m