#!/usr/bin/env python3
# source_distinction.py - "I think" vs "I know" annotation system.
# SOUL.md: "Every claim I make comes from one of two places: a verified source
# I can point to, or my own pattern-matching."
# Part of #793
|
||
|
|
|
||
|
|
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional
|
||
|
|
|
||
|
|
|
||
|
|
class SourceType(Enum):
    """Where a claim comes from.

    Each member's value is the lowercase form of its name.
    """

    # Backed by a source that can be pointed to.
    VERIFIED = "verified"

    # Produced by pattern-matching rather than a citable source.
    INFERRED = "inferred"

    # Supplied by the other party in the conversation.
    STATED = "stated"

    # Provenance could not be determined.
    UNKNOWN = "unknown"
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class Claim:
    """One statement together with the metadata describing its provenance."""

    # The statement itself.
    text: str

    # Provenance category of the statement.
    source_type: SourceType

    # Reference to the backing source; empty when there is none.
    source_ref: str = ""

    # Confidence score attached to the statement; defaults to 0.0.
    confidence: float = 0.0

    # Hedge phrase to show with the statement; empty when none was given.
    hedging: str = ""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class AnnotatedResponse:
    """A response's raw text plus the claims it is made of."""

    raw_text: str
    claims: List[Claim] = field(default_factory=list)

    def render(self):
        """Return the response with each claim marked by its provenance.

        When there are no claims the raw text is returned unchanged.
        Otherwise the result is built from the claims alone, joined by
        single spaces:

        * verified -> ``text [verified: ref]`` (or ``[verified]`` without a ref)
        * inferred -> ``hedge text`` (hedge falls back to "I think")
        * stated   -> ``text [you told me]``
        * other    -> ``I am not certain, but text``
        """
        if not self.claims:
            return self.raw_text

        rendered = []
        for item in self.claims:
            kind = item.source_type
            if kind == SourceType.VERIFIED:
                if item.source_ref:
                    tag = "[verified: " + item.source_ref + "]"
                else:
                    tag = "[verified]"
                piece = item.text + " " + tag
            elif kind == SourceType.INFERRED:
                # An empty hedge falls back to the default phrase.
                lead_in = item.hedging if item.hedging else "I think"
                piece = lead_in + " " + item.text
            elif kind == SourceType.STATED:
                piece = item.text + " [you told me]"
            else:
                piece = "I am not certain, but " + item.text
            rendered.append(piece)

        return " ".join(rendered)

    @property
    def verified_count(self):
        """Number of claims whose source type is VERIFIED."""
        matching = [c for c in self.claims if c.source_type == SourceType.VERIFIED]
        return len(matching)

    @property
    def inferred_count(self):
        """Number of claims whose source type is INFERRED."""
        matching = [c for c in self.claims if c.source_type == SourceType.INFERRED]
        return len(matching)
|
||
|
|
|
||
|
|
|
||
|
|
def verified(text, source, confidence=0.95):
    """Build a VERIFIED claim for ``text`` that cites ``source``.

    ``source`` is stored as the claim's ``source_ref``; ``confidence``
    defaults to 0.95.
    """
    claim = Claim(
        text=text,
        source_type=SourceType.VERIFIED,
        source_ref=source,
        confidence=confidence,
    )
    return claim
|
||
|
|
|
||
|
|
def inferred(text, hedging="I think", confidence=0.6):
    """Build an INFERRED claim for ``text`` carrying a hedge phrase.

    ``hedging`` defaults to "I think" and ``confidence`` to 0.6.
    """
    claim = Claim(
        text=text,
        source_type=SourceType.INFERRED,
        confidence=confidence,
        hedging=hedging,
    )
    return claim
|
||
|
|
|
||
|
|
def stated(text):
    """Build a STATED claim for ``text`` with confidence fixed at 1.0."""
    claim = Claim(
        text=text,
        source_type=SourceType.STATED,
        confidence=1.0,
    )
    return claim
|
||
|
|
|
||
|
|
|
||
|
|
def annotate_response(raw_text, claims):
    """Bundle ``raw_text`` and ``claims`` into an AnnotatedResponse.

    The ``claims`` object is stored as given (it is not copied).
    """
    bundle = AnnotatedResponse(raw_text=raw_text, claims=claims)
    return bundle
|
||
|
|
|
||
|
|
|
||
|
|
def format_for_display(response: "AnnotatedResponse") -> str:
    """Format a response's claims as one marked line per claim plus a tally.

    Line markers:

    * ``=`` verified claim, with ``(source_ref)`` appended when a ref is set
    * ``~`` inferred claim, preceded by its hedge phrase
    * ``>`` stated claim
    * ``?`` any other source type

    When there is at least one claim, a blank line and a
    ``[V verified, I inferred, T total]`` summary are appended.

    Args:
        response: Object exposing ``claims``, ``verified_count`` and
            ``inferred_count``.

    Returns:
        The lines joined with newlines; an empty string when the response
        has no claims.
    """
    lines = []
    for claim in response.claims:
        if claim.source_type == SourceType.VERIFIED:
            ref = " (" + claim.source_ref + ")" if claim.source_ref else ""
            lines.append(" = " + claim.text + ref)
        elif claim.source_type == SourceType.INFERRED:
            # A Claim built directly defaults to hedging="", which used to
            # produce " ~  text" with no hedge at all. Fall back to the same
            # default phrase that AnnotatedResponse.render() uses.
            hedge = claim.hedging or "I think"
            lines.append(" ~ " + hedge + " " + claim.text)
        elif claim.source_type == SourceType.STATED:
            lines.append(" > " + claim.text)
        else:
            lines.append(" ? " + claim.text)

    if response.claims:
        verified_total = response.verified_count
        inferred_total = response.inferred_count
        claim_total = len(response.claims)
        lines.append("")  # blank separator before the summary line
        lines.append(
            f" [{verified_total} verified, {inferred_total} inferred, "
            f"{claim_total} total]"
        )
    return "\n".join(lines)
|
||
|
|
|
||
|
|
|
||
|
|
# Hedging phrases, compiled once with word boundaries so that a phrase only
# matches as whole words ("might" must not fire on "mighty").
_HEDGING_PATTERNS = tuple(
    re.compile(r"\b" + fragment + r"\b")
    for fragment in (
        "i think",
        "i believe",
        "probably",
        # Plain substring matching used to count "unlikely" through "likely";
        # keep that as a single entry so existing counts do not change.
        "(?:un)?likely",
        "might",
        "it seems",
        "perhaps",
        "i am not sure",
        "i guess",
        "my understanding is",
        "i suspect",
    )
)


def source_distinction_check(text):
    """Report how much hedging language ``text`` contains.

    Matching is case-insensitive and on whole words. Each hedging phrase is
    counted at most once, no matter how often it occurs.

    Args:
        text: The text to inspect. Empty or ``None`` is treated as text
            with no hedging.

    Returns:
        A dict with three keys:
        ``has_hedging`` (bool) - at least one phrase was found;
        ``hedging_count`` (int) - number of distinct phrases found;
        ``likely_inferred`` (bool) - more than two distinct phrases found.
    """
    if not text:
        return {"has_hedging": False, "hedging_count": 0,
                "likely_inferred": False}

    lowered = text.lower()
    hedging_count = sum(
        1 for pattern in _HEDGING_PATTERNS if pattern.search(lowered)
    )
    return {"has_hedging": hedging_count > 0, "hedging_count": hedging_count,
            "likely_inferred": hedging_count > 2}
|