{ "model_name": "SV-SPR (caller-agnostic, reference-only)", "model_file": "model/svspr_v14_seq.pkl", "model_version": "v14_seq_only_unified", "model_sha256": "36b02f6249ec1858b21b0a9836590a833e7badb1d08e4b09b6e8a1908527dc46", "architecture": "sklearn.ensemble.RandomForestClassifier", "n_estimators": 200, "class_weight": "balanced", "trained_with_sklearn": "1.4.0", "verified_with_sklearn": "1.6.1", "random_seed": 42, "training": { "csv_source": "sv_matched_features_v7_depth20.csv", "cohort": "143 Korean parents (probands held out), paired Illumina DRAGEN + PacBio HiFi", "label": "confirmed = SRS Manta call matched by LRS Sawfish within +-500bp, same svtype", "common_sv_scope": "KPPD_AF >= 0.01 (diploid, n_alt/(2*226))", "train_cap_per_fold": 300000, "cross_validation": "143-fold sample-LOSO (LeaveOneGroupOut by sample_id)", "cv_f1_avg": 0.9593, "cv_f1_ci95": [0.9564, 0.9616], "cv_auroc": 0.9739, "comparison_v13_parents_FULL_f1": 0.9308 }, "n_features": 11, "feature_order": [ "svlen_abs_manta", "log10_svlen", "svtype_DEL_manta", "svtype_INS_manta", "svtype_DUP_manta", "svtype_BND_manta", "gc_flank_w100", "at_flank_w100", "gc_inner_w100", "n_motif_2_w100", "n_motif_3_w100" ], "features": { "svlen_abs_manta": {"dtype": "float", "desc": "abs(SVLEN) in bp", "source": "VCF INFO SVLEN or END-POS", "default": 0}, "log10_svlen": {"dtype": "float", "desc": "log10(svlen_abs + 1)", "source": "derived", "default": 0}, "svtype_DEL_manta": {"dtype": "int{0,1}", "desc": "one-hot DEL", "source": "VCF INFO SVTYPE", "default": 0}, "svtype_INS_manta": {"dtype": "int{0,1}", "desc": "one-hot INS", "source": "VCF INFO SVTYPE", "default": 0}, "svtype_DUP_manta": {"dtype": "int{0,1}", "desc": "one-hot DUP", "source": "VCF INFO SVTYPE", "default": 0}, "svtype_BND_manta": {"dtype": "int{0,1}", "desc": "one-hot BND (INV grouped here)", "source": "VCF INFO SVTYPE", "default": 0}, "gc_flank_w100": {"dtype": "float[0,1]", "desc": "GC fraction in +-100bp flanks (5'+3' averaged)", "source": 
"reference FASTA", "default": 0}, "at_flank_w100": {"dtype": "float[0,1]", "desc": "AT fraction in +-100bp flanks", "source": "reference FASTA", "default": 0}, "gc_inner_w100": {"dtype": "float[0,1]", "desc": "GC fraction inside DEL/DUP span (INS: insseq; fallback=gc_flank)", "source": "reference FASTA", "default": 0}, "n_motif_2_w100": {"dtype": "int", "desc": "count of dinucleotide tandem motifs in flank", "source": "reference FASTA", "default": 0}, "n_motif_3_w100": {"dtype": "int", "desc": "count of trinucleotide tandem motifs in flank", "source": "reference FASTA", "default": 0} }, "window_bp": 100, "required_inputs": ["VCF: chrom,pos,end/SVLEN,SVTYPE", "reference FASTA (GRCh38)"], "not_required": ["BAM/CRAM", "caller-specific INFO/FORMAT fields (QUAL, GQ, PR, SR, IMPRECISE)"], "output": {"CS": "P(confirmed by LRS), float[0,1]", "tier": "High>=0.9 | Moderate >=0.7 and <0.9 | Warning >=0.5 and <0.7 | Low<0.5 (matches Methods 2.7.2)"}, "calibration_note": "CS is the RandomForest positive-class probability and is NOT calibrated out-of-the-box (held-out ECE ~= 0.07; under-confident in mid-range). Apply isotonic/Platt calibration before treating CS as a literal probability." }