Tabular Classification
Scikit-learn
Joblib
genomics
structural-variants
short-tandem-repeats
variant-calling
confidence-calibration
random-forest
Instructions to use khyeom/SVSTR-Score with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use khyeom/SVSTR-Score with Scikit-learn:
```python
from huggingface_hub import hf_hub_download
import joblib

model = joblib.load(
    hf_hub_download("khyeom/SVSTR-Score", "sklearn_model.joblib")
)
# only load pickle files from sources you trust
# read more about it here https://skops.readthedocs.io/en/stable/persistence.html
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "model_name": "SV-SPR (caller-agnostic, reference-only)", | |
| "model_file": "model/svspr_v14_seq.pkl", | |
| "model_version": "v14_seq_only_unified", | |
| "model_sha256": "36b02f6249ec1858b21b0a9836590a833e7badb1d08e4b09b6e8a1908527dc46", | |
| "architecture": "sklearn.ensemble.RandomForestClassifier", | |
| "n_estimators": 200, | |
| "class_weight": "balanced", | |
| "trained_with_sklearn": "1.4.0", | |
| "verified_with_sklearn": "1.6.1", | |
| "random_seed": 42, | |
| "training": { | |
| "csv_source": "sv_matched_features_v7_depth20.csv", | |
| "cohort": "143 Korean parents (probands held out), paired Illumina DRAGEN + PacBio HiFi", | |
| "label": "confirmed = SRS Manta call matched by LRS Sawfish within +-500bp, same svtype", | |
| "common_sv_scope": "KPPD_AF >= 0.01 (diploid, n_alt/(2*226))", | |
| "train_cap_per_fold": 300000, | |
| "cross_validation": "143-fold sample-LOSO (LeaveOneGroupOut by sample_id)", | |
| "cv_f1_avg": 0.9593, | |
| "cv_f1_ci95": [0.9564, 0.9616], | |
| "cv_auroc": 0.9739, | |
| "comparison_v13_parents_FULL_f1": 0.9308 | |
| }, | |
| "n_features": 11, | |
| "feature_order": [ | |
| "svlen_abs_manta", | |
| "log10_svlen", | |
| "svtype_DEL_manta", | |
| "svtype_INS_manta", | |
| "svtype_DUP_manta", | |
| "svtype_BND_manta", | |
| "gc_flank_w100", | |
| "at_flank_w100", | |
| "gc_inner_w100", | |
| "n_motif_2_w100", | |
| "n_motif_3_w100" | |
| ], | |
| "features": { | |
| "svlen_abs_manta": {"dtype": "float", "desc": "abs(SVLEN) in bp", "source": "VCF INFO SVLEN or END-POS", "default": 0}, | |
| "log10_svlen": {"dtype": "float", "desc": "log10(svlen_abs + 1)", "source": "derived", "default": 0}, | |
| "svtype_DEL_manta": {"dtype": "int{0,1}", "desc": "one-hot DEL", "source": "VCF INFO SVTYPE", "default": 0}, | |
| "svtype_INS_manta": {"dtype": "int{0,1}", "desc": "one-hot INS", "source": "VCF INFO SVTYPE", "default": 0}, | |
| "svtype_DUP_manta": {"dtype": "int{0,1}", "desc": "one-hot DUP", "source": "VCF INFO SVTYPE", "default": 0}, | |
| "svtype_BND_manta": {"dtype": "int{0,1}", "desc": "one-hot BND (INV grouped here)", "source": "VCF INFO SVTYPE", "default": 0}, | |
| "gc_flank_w100": {"dtype": "float[0,1]", "desc": "GC fraction in +-100bp flanks (5'+3' averaged)", "source": "reference FASTA", "default": 0}, | |
| "at_flank_w100": {"dtype": "float[0,1]", "desc": "AT fraction in +-100bp flanks", "source": "reference FASTA", "default": 0}, | |
| "gc_inner_w100": {"dtype": "float[0,1]", "desc": "GC fraction inside DEL/DUP span (INS: insseq; fallback=gc_flank)", "source": "reference FASTA", "default": 0}, | |
| "n_motif_2_w100": {"dtype": "int", "desc": "count of dinucleotide tandem motifs in flank", "source": "reference FASTA", "default": 0}, | |
| "n_motif_3_w100": {"dtype": "int", "desc": "count of trinucleotide tandem motifs in flank", "source": "reference FASTA", "default": 0} | |
| }, | |
| "window_bp": 100, | |
| "required_inputs": ["VCF: chrom,pos,end/SVLEN,SVTYPE", "reference FASTA (GRCh38)"], | |
| "not_required": ["BAM/CRAM", "caller-specific INFO/FORMAT fields (QUAL, GQ, PR, SR, IMPRECISE)"], | |
| "output": {"CS": "P(confirmed by LRS), float[0,1]", "tier": "High>=0.9 | Moderate 0.7-0.9 | Warning 0.5-0.7 | Low<0.5 (matches Methods 2.7.2)"}, | |
| "calibration_note": "CS is the RandomForest positive-class probability and is NOT calibrated out-of-the-box (held-out ECE ~= 0.07; under-confident in mid-range). Apply isotonic/Platt calibration before treating CS as a literal probability." | |
| } | |