Instructions to use Georg4000/Octa with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- ESPnet
How to use Georg4000/Octa with ESPnet:
unknown model type (must be text-to-speech or automatic-speech-recognition)
- Notebooks
- Google Colab
- Kaggle
| from transformers import PreTrainedTokenizerFast | |
| from tokenizers import Tokenizer, normalizers, pre_tokenizers, trainers, models | |
| from tokenizers.normalizers import Lowercase, NFD, StripAccents | |
| from tokenizers.pre_tokenizers import Whitespace | |
| from typing import Optional, List, Union | |
class OctagonTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer with BERT-style special tokens and a BPE training helper.

    Instances wrap a ``tokenizers`` backend loaded from ``tokenizer_file`` (or
    passed through ``**kwargs``, e.g. ``tokenizer_object``). Use
    :meth:`train_tokenizer` to train a new BPE vocabulary from raw text.
    """

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        """Initialize the tokenizer.

        Args:
            vocab_file: Accepted for signature compatibility; not forwarded
                to the parent constructor.
            merges_file: Accepted for signature compatibility; not forwarded
                to the parent constructor.
            tokenizer_file: Path to a serialized ``tokenizers`` JSON file.
            unk_token: Token used for unknown symbols.
            sep_token: Separator token.
            pad_token: Padding token.
            cls_token: Classification token.
            mask_token: Mask token.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizerFast``.
        """
        # NOTE: vocab_file / merges_file are intentionally not passed on; only
        # the serialized tokenizer file (or a kwarg such as tokenizer_object)
        # supplies the backend.
        super().__init__(
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

    @classmethod
    def train_tokenizer(
        cls,
        texts: List[str],
        vocab_size: int = 30522,
        save_path: Optional[str] = None,
    ):
        """Train a BPE tokenizer on ``texts`` and return a wrapped instance.

        Args:
            texts: Training corpus, one string per example.
            vocab_size: Target vocabulary size handed to the BPE trainer.
            save_path: If given (and non-empty), the trained tokenizer is
                written to this path and the returned instance is loaded
                from that file; otherwise the in-memory tokenizer is wrapped.

        Returns:
            An instance of ``cls`` backed by the newly trained tokenizer.
        """
        # Give the model an explicit unknown token so symbols never seen in
        # training map to "[UNK]", matching the wrapper's default unk_token.
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

        # Decompose, lowercase, then strip accents before pre-tokenization.
        tokenizer.normalizer = normalizers.Sequence(
            [NFD(), Lowercase(), StripAccents()]
        )
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        )
        tokenizer.train_from_iterator(texts, trainer=trainer)

        if save_path:
            # Persist first, then load from disk so the returned instance
            # reflects exactly what was saved.
            tokenizer.save(save_path)
            return cls(tokenizer_file=save_path)
        return cls(tokenizer_object=tokenizer)