"""Helpers for chunking, de-duplicating and summarising text examples."""

import hashlib
import re
import textwrap
import time

# Paragraphs are separated by at least one blank (whitespace-only) line.
_PARAGRAPH_BREAK = re.compile(r'\n\s*\n')
# Sentences end at '.', '!' or '?' followed by whitespace; the lookbehind
# keeps the terminator attached to the sentence it closes.
_SENTENCE_BREAK = re.compile(r'(?<=[.!?])\s+')


def _sentence_units(paragraph, max_chars):
    """Yield pieces of *paragraph*, each at most *max_chars* long.

    Sentences that already fit are yielded unchanged. A sentence that is
    longer than *max_chars* is wrapped on whitespace, and any single word
    that is still too long is cut at exactly *max_chars* characters.
    """
    for sentence in _SENTENCE_BREAK.split(paragraph):
        if len(sentence) <= max_chars:
            yield sentence
        else:
            # Hyphen breaking is disabled so cuts happen only on whitespace
            # or, for over-long words, at the hard character limit.
            yield from textwrap.wrap(
                sentence,
                width=max_chars,
                break_long_words=True,
                break_on_hyphens=False,
            )


def chunk_text(text, max_chars=500):
    """Split *text* into chunks of at most *max_chars* characters.

    The text is split into paragraphs on blank lines. A paragraph that fits
    is kept whole; a longer one is split into sentences, which are greedily
    re-joined with single spaces while the result still fits. Sentences that
    are themselves longer than *max_chars* are wrapped so that no returned
    chunk exceeds the limit.

    Args:
        text: The string to split. ``None`` or an empty string yields ``[]``.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        A list of non-empty chunk strings in document order.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError(
            f"max_chars must be a positive integer, got {max_chars!r}"
        )
    if not text:
        return []

    chunks = []
    for para in _PARAGRAPH_BREAK.split(text.strip()):
        para = para.strip()
        if not para:
            continue
        if len(para) <= max_chars:
            chunks.append(para)
            continue

        # Greedy packing: keep appending units while the joined text fits.
        current = ""
        for unit in _sentence_units(para, max_chars):
            if not current:
                current = unit
            elif len(current) + 1 + len(unit) <= max_chars:
                current = f"{current} {unit}"
            else:
                chunks.append(current)
                current = unit
        if current:
            chunks.append(current)
    return chunks


def deduplicate(examples):
    """Return *examples* without repeats of the same normalised text.

    Two examples are duplicates when their ``"text"`` values are equal after
    stripping surrounding whitespace and lower-casing. A missing or ``None``
    text is treated as the empty string. The first occurrence is kept and
    the input order is preserved.

    Args:
        examples: Iterable of dict-like examples.

    Returns:
        A new list holding the first example seen for each distinct text.
    """
    seen = set()
    unique = []
    for ex in examples:
        normalized = (ex.get("text") or "").strip().lower()
        # MD5 serves only as a compact fingerprint here, not for security.
        digest = hashlib.md5(normalized.encode()).hexdigest()
        if digest in seen:
            continue
        seen.add(digest)
        unique.append(ex)
    return unique


def build_source_map(examples):
    """Summarise *examples* per source.

    Args:
        examples: Iterable of dict-like examples. ``"source"`` defaults to
            ``"unknown"`` when absent; ``"domain"`` is optional.

    Returns:
        A dict mapping each source to ``{"count": int, "domains": list}``,
        where ``count`` is the number of examples from that source and
        ``domains`` lists its distinct domains in first-seen order.
    """
    tallies = {}
    for ex in examples:
        entry = tallies.setdefault(
            ex.get("source", "unknown"),
            {"count": 0, "domains": {}},
        )
        entry["count"] += 1
        if "domain" in ex:
            # A dict used as an ordered set keeps the output deterministic.
            entry["domains"][ex["domain"]] = None
    return {
        src: {"count": entry["count"], "domains": list(entry["domains"])}
        for src, entry in tallies.items()
    }


def process_training_batch(examples, max_chars=500):
    """De-duplicate *examples* and expand each one into chunk records.

    Args:
        examples: Iterable of dict-like examples with an optional ``"text"``
            plus optional ``"source"``, ``"domain"``, ``"level"`` and
            ``"added"`` metadata.
        max_chars: Maximum chunk length, forwarded to ``chunk_text``.

    Returns:
        A list of dicts, one per chunk, with the keys ``"text"``,
        ``"source"``, ``"domain"``, ``"level"`` and ``"added"``. Missing
        metadata defaults to ``"unknown"``; a missing ``"added"`` defaults
        to the time at which this call started.

    Raises:
        ValueError: If *max_chars* is smaller than 1 and there is at least
            one example to chunk.
    """
    # Sample the clock once so every record of this batch that lacks an
    # explicit "added" value receives the same timestamp.
    batch_time = time.time()

    processed = []
    for ex in deduplicate(examples):
        source = ex.get("source", "unknown")
        domain = ex.get("domain", "unknown")
        level = ex.get("level", "unknown")
        added = ex.get("added", batch_time)
        for chunk in chunk_text(ex.get("text", ""), max_chars):
            processed.append({
                "text": chunk,
                "source": source,
                "domain": domain,
                "level": level,
                "added": added,
            })
    return processed