# Vitalis_Core_UI/training/processor.py
# FerrellSyntheticIntelligence
# Add training pipeline: auto-generator (6 domains, 7 cognitive levels), processor (chunk/dedup), pipeline orchestrator, training dashboard UI tab
# Commit 8a42496 (2.02 kB)
import re
import time
import hashlib
def chunk_text(text, max_chars=500):
    """Split *text* into chunks of at most *max_chars* characters.

    The text is first split into paragraphs on blank lines. A paragraph that
    fits within *max_chars* becomes one chunk unchanged. A longer paragraph
    is split into sentences (after ``.``, ``!`` or ``?`` followed by
    whitespace) and the sentences are greedily packed, joined by single
    spaces, into chunks that stay within the limit. A sentence that is itself
    longer than *max_chars* is broken on whitespace, and any single word that
    is still too long is sliced, so no returned chunk exceeds *max_chars*.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_chars: Maximum length of each chunk; must be at least 1.

    Returns:
        A list of non-empty chunk strings in document order.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")
    if not text:
        return []

    chunks = []
    for para in re.split(r'\n\s*\n', text.strip()):
        para = para.strip()
        if not para:
            continue
        if len(para) <= max_chars:
            chunks.append(para)
            continue

        # Paragraph is too long: pack sentence-sized pieces greedily.
        current = ""
        for sent in re.split(r'(?<=[.!?])\s+', para):
            for piece in _split_oversized(sent, max_chars):
                if not current:
                    current = piece
                elif len(current) + len(piece) + 1 <= max_chars:
                    # +1 accounts for the joining space.
                    current = f"{current} {piece}"
                else:
                    chunks.append(current)
                    current = piece
        if current:
            chunks.append(current)
    return chunks


def _split_oversized(sentence, max_chars):
    """Break one sentence into pieces that are each at most *max_chars* long.

    A sentence that already fits is returned as a single-element list,
    untouched. Otherwise it is split on whitespace (runs of whitespace
    collapse to one space when words are re-joined) and words longer than
    *max_chars* are sliced into fixed-size segments.
    """
    if len(sentence) <= max_chars:
        return [sentence]

    pieces = []
    current = ""
    for word in sentence.split():
        # A word that can never fit is flushed in max_chars-sized slices.
        while len(word) > max_chars:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_chars])
            word = word[max_chars:]
        if not current:
            current = word
        elif len(current) + len(word) + 1 <= max_chars:
            current = f"{current} {word}"
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces
def deduplicate(examples):
    """Return *examples* with duplicate texts removed, keeping first occurrences.

    Two examples are duplicates when their ``"text"`` values are equal after
    stripping surrounding whitespace and lower-casing. A missing or ``None``
    text is treated as the empty string, and a non-string text is compared by
    its ``str()`` form, so such examples no longer raise.

    Args:
        examples: Iterable of dict-like examples (anything with ``.get``).

    Returns:
        A new list holding the first example seen for each distinct text, in
        input order. The example objects themselves are not copied.
    """
    seen = set()
    unique = []
    for ex in examples:
        raw = ex.get("text")
        if raw is None:
            raw = ""
        elif not isinstance(raw, str):
            raw = str(raw)
        normalized = raw.strip().lower()
        # SHA-256 rather than MD5: MD5 may be disabled on FIPS-restricted
        # builds. "surrogatepass" keeps lone surrogates from raising on encode.
        digest = hashlib.sha256(
            normalized.encode("utf-8", "surrogatepass")
        ).digest()
        if digest not in seen:
            seen.add(digest)
            unique.append(ex)
    return unique
def build_source_map(examples):
    """Summarise *examples* by their ``"source"`` value.

    Args:
        examples: Iterable of dict-like examples. ``"source"`` defaults to
            ``"unknown"`` when absent; ``"domain"`` is optional.

    Returns:
        A dict mapping each source to ``{"count": int, "domains": list}``,
        where ``count`` is the number of examples from that source and
        ``domains`` lists each distinct domain once. The domain list is
        sorted by string form so the result is the same on every run
        (converting a set straight to a list gives an order that can change
        between runs).
    """
    sources = {}
    for ex in examples:
        src = ex.get("source", "unknown")
        entry = sources.get(src)
        if entry is None:
            # A dict is used as an insertion-ordered set of domains.
            entry = sources[src] = {"count": 0, "domains": {}}
        entry["count"] += 1
        if "domain" in ex:
            entry["domains"][ex["domain"]] = None

    for entry in sources.values():
        # key=str keeps mixed-type domains sortable; ties keep first-seen order.
        entry["domains"] = sorted(entry["domains"], key=str)
    return sources
def process_training_batch(examples, max_chars=500):
    """Deduplicate *examples* and expand each into chunk-level records.

    The examples are passed through ``deduplicate`` and each surviving
    example's text is split with ``chunk_text``. Every chunk becomes one
    record that carries the example's metadata.

    Args:
        examples: Iterable of dict-like examples with optional ``"text"``,
            ``"source"``, ``"domain"``, ``"level"`` and ``"added"`` keys.
        max_chars: Chunk size limit forwarded to ``chunk_text``.

    Returns:
        A list of dicts with keys ``"text"``, ``"source"``, ``"domain"``,
        ``"level"`` and ``"added"``. Missing ``source``/``domain``/``level``
        default to ``"unknown"``; a missing ``added`` defaults to a single
        timestamp taken once for the whole batch.
    """
    # One timestamp per batch, so chunks of the same batch do not get
    # slightly different "added" values.
    batch_time = time.time()

    processed = []
    for ex in deduplicate(examples):
        text = ex.get("text")
        if text is None:
            text = ""
        elif not isinstance(text, str):
            # chunk_text calls string methods; coerce stray non-string values.
            text = str(text)

        source = ex.get("source", "unknown")
        domain = ex.get("domain", "unknown")
        level = ex.get("level", "unknown")
        added = ex.get("added", batch_time)

        for chunk in chunk_text(text, max_chars):
            processed.append({
                "text": chunk,
                "source": source,
                "domain": domain,
                "level": level,
                "added": added,
            })
    return processed