Text Classification
Model2Vec
Safetensors
English
pubguard
document-classification
scientific-papers
ai-detection
toxicity-detection
pubverse
publication-screening
quality-control
Instructions to use jimnoneill/pub-guard with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Model2Vec
How to use jimnoneill/pub-guard with Model2Vec:
    from model2vec import StaticModel
    model = StaticModel.from_pretrained("jimnoneill/pub-guard")
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """ | |
| PubGuard gate for pipeline integration. | |
| Reads extracted PDF text from stdin or a file, screens it, and: | |
| - Prints verdict JSON to STDERR (for debugging) | |
| - Prints PASS/FAIL to STDERR | |
| - Exits 0 (pass) or 1 (fail) | |
| Usage: | |
| echo "$PDF_TEXT" | python3 pub_check/scripts/pubguard_gate.py | |
| python3 pub_check/scripts/pubguard_gate.py extracted_text.txt | |
| Environment variables: | |
| PUBGUARD_MODELS_DIR — Override models directory | |
| PUBGUARD_STRICT — Set to "0" to warn instead of gate (exit 0 always) | |
| """ | |
| import json | |
| import sys | |
| import os | |
| import logging | |
| logging.basicConfig( | |
| level=logging.WARNING, | |
| format="%(asctime)s | %(levelname)s | %(message)s", | |
| datefmt="%H:%M:%S", | |
| ) | |
| from pubguard import PubGuard, PubGuardConfig | |
def _read_input(argv):
    """Return the text to screen, from the file named by argv[1] or from stdin.

    A missing argument or a literal "-" selects stdin. Undecodable bytes are
    replaced rather than raising, on both the file and the stdin path.

    Raises:
        OSError: if the named file cannot be opened or read.
    """
    if len(argv) > 1 and argv[1] != "-":
        # Pin the encoding so the result does not depend on the host locale.
        with open(argv[1], encoding="utf-8", errors="replace") as f:
            return f.read()

    # Mirror the file path's errors="replace" so a stray byte in piped text
    # cannot crash the gate. Only real text streams offer reconfigure().
    reconfigure = getattr(sys.stdin, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(errors="replace")
    return sys.stdin.read()


def _failure_reasons(verdict):
    """Build the human-readable reasons for a failing verdict.

    Reads verdict["doc_type"], verdict["ai_generated"] and verdict["toxicity"],
    each indexed by "label" (and "score" for the latter two).
    """
    reasons = []

    doc_label = verdict["doc_type"]["label"]
    if doc_label != "scientific_paper":
        reasons.append(f"doc_type={doc_label}")

    ai_result = verdict["ai_generated"]
    if ai_result["label"] == "ai_generated":
        reasons.append(f"ai_generated (score={ai_result['score']:.2f})")

    toxicity = verdict["toxicity"]
    if toxicity["label"] == "toxic":
        reasons.append(f"toxic (score={toxicity['score']:.2f})")

    return reasons


def main():
    """Screen the input text and exit with a pass/fail status.

    All output goes to stderr: the verdict as JSON, then a PASS or FAIL line.

    Exit status:
        0 -- the verdict passed, or it failed while PUBGUARD_STRICT is "0".
        1 -- the verdict failed in strict mode (the default), or the input
             was unreadable or blank. Input problems exit 1 regardless of
             PUBGUARD_STRICT, because they are detected before any screening.
    """
    try:
        text = _read_input(sys.argv)
    except OSError as exc:
        # Emit a diagnostic in the script's own format instead of a traceback.
        print(f"PUBGUARD: Cannot read input: {exc}", file=sys.stderr)
        sys.exit(1)

    if not text.strip():
        print("PUBGUARD: Empty input", file=sys.stderr)
        sys.exit(1)

    strict = os.environ.get("PUBGUARD_STRICT", "1") != "0"

    guard = PubGuard(config=PubGuardConfig())
    guard.initialize()
    verdict = guard.screen(text)
    print(json.dumps(verdict), file=sys.stderr)

    if verdict["pass"]:
        print("PUBGUARD: PASS", file=sys.stderr)
        sys.exit(0)

    # A verdict can fail without matching any of the known labels; keep the
    # FAIL line informative rather than ending in a dangling separator.
    reasons = _failure_reasons(verdict) or ["unspecified"]
    print(f"PUBGUARD: FAIL — {', '.join(reasons)}", file=sys.stderr)

    if strict:
        sys.exit(1)

    print("PUBGUARD: Running in non-strict mode, continuing...", file=sys.stderr)
    sys.exit(0)
| if __name__ == "__main__": | |
| main() | |