SVSTR-Score / seqonly /feature_schema.json
khyeom's picture
Add sequence-only headline model (svspr_v14_seq, 11-feature) + inference package
90d0b4b verified
Raw
History Blame Contribute Delete
3.37 kB
{
"model_name": "SV-SPR (caller-agnostic, reference-only)",
"model_file": "model/svspr_v14_seq.pkl",
"model_version": "v14_seq_only_unified",
"model_sha256": "36b02f6249ec1858b21b0a9836590a833e7badb1d08e4b09b6e8a1908527dc46",
"architecture": "sklearn.ensemble.RandomForestClassifier",
"n_estimators": 200,
"class_weight": "balanced",
"trained_with_sklearn": "1.4.0",
"verified_with_sklearn": "1.6.1",
"random_seed": 42,
"training": {
"csv_source": "sv_matched_features_v7_depth20.csv",
"cohort": "143 Korean parents (probands held out), paired Illumina DRAGEN + PacBio HiFi",
"label": "confirmed = SRS Manta call matched by LRS Sawfish within +-500bp, same svtype",
"common_sv_scope": "KPPD_AF >= 0.01 (diploid, n_alt/(2*226))",
"train_cap_per_fold": 300000,
"cross_validation": "143-fold sample-LOSO (LeaveOneGroupOut by sample_id)",
"cv_f1_avg": 0.9593,
"cv_f1_ci95": [0.9564, 0.9616],
"cv_auroc": 0.9739,
"comparison_v13_parents_FULL_f1": 0.9308
},
"n_features": 11,
"feature_order": [
"svlen_abs_manta",
"log10_svlen",
"svtype_DEL_manta",
"svtype_INS_manta",
"svtype_DUP_manta",
"svtype_BND_manta",
"gc_flank_w100",
"at_flank_w100",
"gc_inner_w100",
"n_motif_2_w100",
"n_motif_3_w100"
],
"features": {
"svlen_abs_manta": {"dtype": "float", "desc": "abs(SVLEN) in bp", "source": "VCF INFO SVLEN or END-POS", "default": 0},
"log10_svlen": {"dtype": "float", "desc": "log10(svlen_abs + 1)", "source": "derived", "default": 0},
"svtype_DEL_manta": {"dtype": "int{0,1}", "desc": "one-hot DEL", "source": "VCF INFO SVTYPE", "default": 0},
"svtype_INS_manta": {"dtype": "int{0,1}", "desc": "one-hot INS", "source": "VCF INFO SVTYPE", "default": 0},
"svtype_DUP_manta": {"dtype": "int{0,1}", "desc": "one-hot DUP", "source": "VCF INFO SVTYPE", "default": 0},
"svtype_BND_manta": {"dtype": "int{0,1}", "desc": "one-hot BND (INV grouped here)", "source": "VCF INFO SVTYPE", "default": 0},
"gc_flank_w100": {"dtype": "float[0,1]", "desc": "GC fraction in +-100bp flanks (5'+3' averaged)", "source": "reference FASTA", "default": 0},
"at_flank_w100": {"dtype": "float[0,1]", "desc": "AT fraction in +-100bp flanks", "source": "reference FASTA", "default": 0},
"gc_inner_w100": {"dtype": "float[0,1]", "desc": "GC fraction inside DEL/DUP span (INS: insseq; fallback=gc_flank)", "source": "reference FASTA", "default": 0},
"n_motif_2_w100": {"dtype": "int", "desc": "count of dinucleotide tandem motifs in flank", "source": "reference FASTA", "default": 0},
"n_motif_3_w100": {"dtype": "int", "desc": "count of trinucleotide tandem motifs in flank", "source": "reference FASTA", "default": 0}
},
"window_bp": 100,
"required_inputs": ["VCF: chrom,pos,end/SVLEN,SVTYPE", "reference FASTA (GRCh38)"],
"not_required": ["BAM/CRAM", "caller-specific INFO/FORMAT fields (QUAL, GQ, PR, SR, IMPRECISE)"],
"output": {"CS": "P(confirmed by LRS), float[0,1]", "tier": "High>=0.9 | Moderate 0.7-0.9 | Warning 0.5-0.7 | Low<0.5 (matches Methods 2.7.2)"},
"calibration_note": "CS is the RandomForest positive-class probability and is NOT calibrated out-of-the-box (held-out ECE ~= 0.07; under-confident in mid-range). Apply isotonic/Platt calibration before treating CS as a literal probability."
}