Spaces:
Build error
Build error
FerrellSyntheticIntelligence
Add training pipeline: auto-generator (6 domains, 7 cognitive levels), processor (chunk/dedup), pipeline orchestrator, training dashboard UI tab
8a42496 | import re | |
| import time | |
| import hashlib | |
def _split_oversized(segment, max_chars):
    """Break *segment* into pieces no longer than *max_chars* characters.

    Cuts at the last whitespace character that keeps the piece within the
    limit; when the window contains no whitespace (one very long word), it
    falls back to a hard cut at exactly *max_chars* characters.
    """
    pieces = []
    remaining = segment.strip()
    while len(remaining) > max_chars:
        cut = max_chars  # hard-cut fallback when no whitespace is found
        # Start one past the limit so a break sitting exactly on the
        # boundary is used; stop at 1 so every piece is non-empty.
        for idx in range(max_chars, 0, -1):
            if remaining[idx].isspace():
                cut = idx
                break
        pieces.append(remaining[:cut].rstrip())
        remaining = remaining[cut:].lstrip()
    if remaining:
        pieces.append(remaining)
    return pieces


def chunk_text(text, max_chars=500):
    """Split *text* into chunks of at most *max_chars* characters.

    The text is first split into paragraphs on blank lines.  A paragraph
    that fits within the limit becomes one chunk.  A longer paragraph is
    split into sentences (after ``.``, ``!`` or ``?`` followed by
    whitespace) and consecutive sentences are packed together, joined by a
    single space, for as long as the result stays within the limit.  A
    sentence that is itself longer than *max_chars* is broken up further,
    so no returned chunk exceeds the limit.

    Args:
        text: The text to split.  ``None`` or an empty string yields ``[]``.
        max_chars: Maximum length of each chunk; must be at least 1.

    Returns:
        A list of non-empty chunk strings in document order.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    if not text:
        return []

    chunks = []
    for para in re.split(r'\n\s*\n', text.strip()):
        para = para.strip()
        if not para:
            continue
        if len(para) <= max_chars:
            chunks.append(para)
            continue

        # Paragraph is too long: pack sentence-sized pieces greedily.
        current = ""
        for sent in re.split(r'(?<=[.!?])\s+', para):
            for piece in _split_oversized(sent, max_chars):
                if not current:
                    current = piece
                elif len(current) + 1 + len(piece) <= max_chars:
                    current = current + " " + piece
                else:
                    chunks.append(current)
                    current = piece
        if current:
            chunks.append(current)
    return chunks
def deduplicate(examples):
    """Return *examples* with duplicate texts removed, keeping first occurrences.

    Two examples are duplicates when their ``"text"`` values are equal after
    stripping surrounding whitespace and lower-casing.  A missing or ``None``
    text is treated as the empty string, so at most one such example is kept.

    Args:
        examples: Iterable of dict-like examples, each optionally carrying a
            ``"text"`` string.

    Returns:
        A new list containing the first example seen for each distinct
        normalised text, in the original order.
    """
    seen = set()
    unique = []
    for ex in examples:
        # `or ""` also covers an explicit None, which .get()'s default does not.
        normalised = (ex.get("text") or "").strip().lower()
        # Store a fixed-size digest instead of the full text to bound memory.
        # SHA-256 is used because MD5 can be unavailable on FIPS-restricted
        # Python builds; the digest never leaves this function.
        digest = hashlib.sha256(normalised.encode("utf-8")).digest()
        if digest in seen:
            continue
        seen.add(digest)
        unique.append(ex)
    return unique
def build_source_map(examples):
    """Summarise *examples* per source.

    Args:
        examples: Iterable of dict-like examples.  ``"source"`` defaults to
            ``"unknown"`` when absent; ``"domain"`` is optional.

    Returns:
        A dict mapping each source to ``{"count": int, "domains": list}``,
        where ``count`` is the number of examples from that source and
        ``domains`` lists its distinct domains in first-seen order.
    """
    counts = {}
    domains_by_source = {}
    for ex in examples:
        src = ex.get("source", "unknown")
        counts[src] = counts.get(src, 0) + 1
        # A dict serves as an insertion-ordered set, so the resulting domain
        # list is deterministic (a plain set's iteration order is not).
        seen_domains = domains_by_source.setdefault(src, {})
        if "domain" in ex:
            seen_domains[ex["domain"]] = None
    return {
        src: {"count": total, "domains": list(domains_by_source[src])}
        for src, total in counts.items()
    }
def process_training_batch(examples, max_chars=500):
    """Deduplicate *examples* and expand each one into per-chunk records.

    The examples are passed through ``deduplicate`` and each surviving
    example's text is split with ``chunk_text``.  Every chunk becomes its own
    record carrying the parent example's metadata.

    Args:
        examples: Iterable of dict-like examples with an optional ``"text"``
            plus optional ``"source"``, ``"domain"``, ``"level"`` and
            ``"added"`` entries.
        max_chars: Chunk size limit forwarded to ``chunk_text``.

    Returns:
        A list of dicts with keys ``"text"``, ``"source"``, ``"domain"``,
        ``"level"`` and ``"added"``.  Missing ``source``/``domain``/``level``
        default to ``"unknown"``; a missing ``added`` defaults to the time at
        which this call started.
    """
    # One timestamp for the whole batch, so records without their own
    # "added" value agree with each other instead of drifting per chunk.
    batch_time = time.time()
    processed = []
    for ex in deduplicate(examples):
        # Metadata is identical for every chunk of an example; build it once.
        metadata = {
            "source": ex.get("source", "unknown"),
            "domain": ex.get("domain", "unknown"),
            "level": ex.get("level", "unknown"),
            "added": ex.get("added", batch_time),
        }
        for chunk in chunk_text(ex.get("text", ""), max_chars):
            processed.append({"text": chunk, **metadata})
    return processed