import json, re, datetime, collections, statistics

# Input: conversation export (JSON); output: directory receiving the analysis files.
SRC="/home/nk/hobo-fable/docs-arch/_persona/export/conversations.json"
OUT="/home/nk/hobo-godmode/persona-analysis"

# JSON is UTF-8 by specification; decode explicitly so the result does not
# depend on the platform's default locale encoding.
with open(SRC, encoding="utf-8") as f:
    convos=json.load(f)

# Timestamp parsing helper, used as the sort key for conversations and messages.
def ts(s):
    """Parse an ISO-8601 timestamp string into a timezone-aware datetime.

    A trailing "Z" is rewritten to "+00:00" so older Python versions accept it.
    Timestamps without an offset are assumed to be UTC, so every value returned
    here is timezone-aware and mutually comparable (mixing naive and aware
    datetimes would make sorting raise TypeError).

    Returns the Unix epoch (1970-01-01 UTC) when `s` is None, not a string, or
    not parseable, so bad records sort first instead of aborting the run.
    """
    try:
        parsed = datetime.datetime.fromisoformat(s.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: s is None or not a string; ValueError: bad format.
        return datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    return parsed

# Order conversations chronologically by their created_at timestamp (in place).
def _conversation_created(conv):
    return ts(conv.get("created_at", ""))

convos.sort(key=_conversation_created)

def clean(t):
    """Return message text with carriage returns replaced by spaces; None becomes ""."""
    return "" if t is None else t.replace("\r", " ")

# ---------- A: human-only, one line per message ----------
human_lines=[]
# ---------- B: condensed conversation flow ----------
flow=[]
all_human=[]                        # raw (cleaned) text of every human message
hour_hist=collections.Counter()     # human messages per local hour of day
dow_hist=collections.Counter()      # human messages per local weekday ('Mon'..'Sun')
month_hist=collections.Counter()    # human messages per conversation month (YYYY-MM)
hlens=[]                            # character length of every human message

_UTC=datetime.timezone.utc
_WS_RE=re.compile(r"\s+")           # compiled once instead of per message


def _eu_dst_switch(year, month):
    """Return 01:00 UTC on the last Sunday of `month` (a 31-day month: March/October)."""
    last_day=datetime.datetime(year, month, 31, 1, tzinfo=_UTC)
    # weekday(): Monday=0 .. Sunday=6 -> step back to the most recent Sunday.
    return last_day-datetime.timedelta(days=(last_day.weekday()+1)%7)


def _to_vienna(t):
    """Shift a timestamp to Vienna wall-clock time (CET=UTC+1, CEST=UTC+2).

    Uses the EU summer-time rule (last Sunday of March to last Sunday of
    October, switching at 01:00 UTC) instead of a fixed +2h, which was wrong
    for every message sent outside summer time. Naive inputs are taken as UTC.
    The returned datetime still carries the UTC tzinfo label; only its clock
    fields are meaningful, which is all the formatting below relies on.
    """
    if t.tzinfo is None:
        t=t.replace(tzinfo=_UTC)
    utc=t.astimezone(_UTC)
    summer=_eu_dst_switch(utc.year, 3) <= utc < _eu_dst_switch(utc.year, 10)
    return utc+datetime.timedelta(hours=2 if summer else 1)


for ci,c in enumerate(convos):
    name=c.get("name","") or "(untitled)"
    # `or ""` also covers an explicit null created_at, which would otherwise
    # break the slicing below.
    created=c.get("created_at") or ""
    msgs=c.get("chat_messages",[]) or []
    msgs.sort(key=lambda m: ts(m.get("created_at","")))
    flow.append(f"\n\n===== CONV {ci+1}/{len(convos)} | {created[:10]} | {name} =====")
    for m in msgs:
        sender=m.get("sender")
        text=clean(m.get("text",""))
        local=_to_vienna(ts(m.get("created_at","")))
        # Collapse all whitespace runs so each message fits on a single line.
        oneline=_WS_RE.sub(" ",text).strip()
        if sender=="human":
            # The date shown is the conversation's creation date; the time is
            # the message's own local time.
            human_lines.append(f"[{created[:10]} {local.strftime('%H:%M')}|{name[:40]}] {oneline}")
            all_human.append(text)
            hlens.append(len(text))
            hour_hist[local.hour]+=1
            dow_hist[local.strftime('%a')]+=1
            month_hist[created[:7]]+=1
            flow.append(f"\n[NEMANJA {local.strftime('%H:%M')}]: {oneline}")
        else:
            # Non-human replies are truncated to 240 chars and appended without
            # a leading newline, so they follow the preceding entry on the same line.
            tr=oneline
            if len(tr)>240: tr=tr[:240]+" …"
            flow.append(f"  [claude]: {tr}")

# Deliverable A: one line per human message. Written as UTF-8 explicitly
# because the text contains non-ASCII characters (e.g. German umlauts) that
# a non-UTF-8 default locale encoding could reject or mangle.
with open(f"{OUT}/human_messages.txt","w",encoding="utf-8") as f:
    f.write("\n".join(human_lines))

# Deliverable B: split the flow into 2 parts to keep each readable.
# The split point is the midpoint of the entry list, so it can fall in the
# middle of a conversation.
flowtext="".join(flow)  # not referenced elsewhere in the visible script
half=len(flow)//2
# UTF-8 is required here: truncated replies end in "…", which many legacy
# default encodings cannot represent.
with open(f"{OUT}/flow_part1.md","w",encoding="utf-8") as f:
    f.write("".join(flow[:half]))
with open(f"{OUT}/flow_part2.md","w",encoding="utf-8") as f:
    f.write("".join(flow[half:]))

# ---------- C: quantitative stats ----------
# Word frequencies over all human text, lower-cased; a "word" is a run of
# a-z plus German umlauts/ß.
fulltext=" ".join(all_human).lower()
words=re.findall(r"[a-zäöüß]+",fulltext)
wc=collections.Counter(words)

# Rough language split: total occurrences of typical German vs English stopwords.
de_markers=[
    "der","die","das","und","ich","nicht","mach","kannst","wie",
    "ein","auf","mir","für","ist","mit","schau","bitte",
]
en_markers=[
    "the","and","you","can","what","with","this","for","make",
    "please","just","how",
]
de=sum(wc[marker_word] for marker_word in de_markers)
en=sum(wc[marker_word] for marker_word in en_markers)

# Message-shape counters over the human messages.
short=len([length for length in hlens if length<=40])
empty=hlens.count(0)
questions=len([msg for msg in all_human if "?" in msg])

# Build the Markdown stats report line by line, then write it out.
lines=[]
n_msgs=len(hlens)


def _pct(count):
    """Integer percentage of human messages; 0 when there are no messages."""
    return 100*count//n_msgs if n_msgs else 0


lines.append("# QUANTITATIVE STATS — Nemanja claude.ai corpus\n")
lines.append(f"Conversations: {len(convos)}  | Human messages: {n_msgs}  | Total chars (his): {sum(hlens)}")
if hlens:
    ordered=sorted(hlens)
    lines.append(f"Msg length: min={ordered[0]} median={int(statistics.median(ordered))} mean={int(statistics.mean(ordered))} p90={ordered[int(n_msgs*0.9)]} max={ordered[-1]}")
else:
    # min()/median()/mean() raise on an empty corpus; report it instead of crashing.
    lines.append("Msg length: n/a (no human messages)")
lines.append(f"Short msgs (<=40 chars): {short} ({_pct(short)}%)  | Empty(text-only attach): {empty}  | Msgs containing '?': {questions} ({_pct(questions)}%)")
lines.append(f"Language markers — DE hits: {de}  EN hits: {en}  (ratio DE:EN ~ {de/max(en,1):.1f}:1)")
lines.append("")
lines.append("## Activity by hour (Vienna local, CEST):")
for h in range(24):
    bar="#"*hour_hist.get(h,0)  # one '#' per message in that hour
    lines.append(f"  {h:02d}:00  {hour_hist.get(h,0):3d} {bar}")
lines.append("")
lines.append("## Activity by weekday:")
for d in ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]:
    lines.append(f"  {d}: {dow_hist.get(d,0)}")
lines.append("")
lines.append("## Messages by month:")
for m in sorted(month_hist): lines.append(f"  {m}: {month_hist[m]}")
lines.append("")
lines.append("## Top 60 content words (>=4 chars):")
# Filter the full frequency ranking (not just the top 400) so the list really
# holds 60 entries whenever at least 60 qualifying words exist.
common=[(w,n) for w,n in wc.most_common() if len(w)>=4][:60]
lines.append("  "+", ".join(f"{w}({n})" for w,n in common))
lines.append("")
# politeness / filler markers he uses (only those that occur at least once)
for marker in ["einfach","nochmal","bitte","danke","perfekt","super","passt","weiter","mach","schau","kannst","brauche","will","mega","geil","scheisse","scheiße","fuck","ok","okay"]:
    if wc.get(marker): lines.append(f"  '{marker}': {wc[marker]}")

# The report contains non-ASCII characters ("—", umlauts), so write UTF-8 explicitly.
with open(f"{OUT}/stats.md","w",encoding="utf-8") as f:
    f.write("\n".join(lines))

# Console summary: entry counts, then the on-disk size of each output file.
import os

print("DONE")
print(f"human_messages.txt lines: {len(human_lines)}")
print(f"flow parts written, total flow lines: {len(flow)}")
for fn in ("human_messages.txt","flow_part1.md","flow_part2.md","stats.md"):
    size_bytes=os.path.getsize(f'{OUT}/{fn}')
    print(fn, size_bytes, "bytes")