Instructions to use lchakkei/2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use lchakkei/2 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="lchakkei/2")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("lchakkei/2")
model = AutoModelForCausalLM.from_pretrained("lchakkei/2")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use lchakkei/2 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "lchakkei/2"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lchakkei/2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/lchakkei/2
- SGLang
How to use lchakkei/2 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "lchakkei/2" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lchakkei/2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "lchakkei/2" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lchakkei/2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use lchakkei/2 with Docker Model Runner:
docker model run hf.co/lchakkei/2
| import torch | |
| import locale | |
| import os | |
| from typing import Dict, List, Any | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig | |
| from langchain.llms import HuggingFacePipeline | |
| from langchain.retrievers.document_compressors import LLMChainExtractor | |
| from langchain.retrievers import ContextualCompressionRetriever | |
| from langchain.vectorstores import Chroma | |
| from langchain import PromptTemplate, LLMChain | |
| from langchain.chains import RetrievalQA, ConversationalRetrievalChain | |
| from langchain.prompts import PromptTemplate | |
| from langchain.prompts.prompt import PromptTemplate | |
| from langchain.memory import ConversationBufferMemory | |
| from langchain.embeddings import HuggingFaceBgeEmbeddings | |
| from langchain.document_loaders import WebBaseLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.chains.qa_with_sources import load_qa_with_sources_chain | |
| from langchain.chains.combine_documents import create_stuff_documents_chain | |
| from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder | |
| from langchain_core.messages import HumanMessage | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_core.runnables import RunnableLambda, RunnableBranch, RunnablePassthrough | |
| from operator import itemgetter | |
| from langchain.schema import format_document | |
| from langchain.memory import ConversationBufferMemory | |
| from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string | |
| from langchain_core.runnables import RunnableParallel | |
| from typing import Optional | |
| from langchain.vectorstores import FAISS | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores.utils import DistanceStrategy | |
# Hub id of the embedding checkpoint; the handler in this file loads both a
# tokenizer and an embedding model from this name.
EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"

# Separators handed to the recursive text splitter, in the order they are
# tried. Several entries are written in regular-expression syntax.
# NOTE(review): the splitter in this file is created without a regex flag, so
# these entries are presumably matched as literal text -- confirm that this is
# the intended behaviour.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]
class EndpointHandler():
    """Retrieval-augmented question-answering handler.

    On construction the handler downloads a fixed set of web pages, splits
    them into token-bounded chunks, indexes the chunks in a FAISS vector
    store and loads a 4-bit quantized causal language model from ``path``.
    Calling the instance retrieves the chunks closest to the question and
    asks the language model to answer from them.

    Attributes:
        vectorstore: FAISS index over the de-duplicated page chunks.
        READER_LLM: ``text-generation`` pipeline used to produce answers.
        RAG_PROMPT_TEMPLATE: chat-formatted prompt string containing the
            ``{context}`` and ``{question}`` placeholders.
    """

    # Splitter settings, counted with the embedding model's tokenizer.
    CHUNK_SIZE = 512
    CHUNK_OVERLAP = CHUNK_SIZE // 10

    # Number of chunks retrieved for each question.
    TOP_K = 2

    # Pages that are downloaded and indexed at start-up.
    SOURCE_URLS = [
        "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2).html",
        "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2)/publications.html",
        "https://www.cityu.edu.hk/media/press-release/2022/05/17/cityu-council-announces-appointment-professor-freddy-boey-next-president",
        "https://www.cityu.edu.hk/media/press-release/2023/05/18/professor-freddy-boey-installed-5th-president-cityu",
    ]

    # Chat messages that are rendered once into RAG_PROMPT_TEMPLATE.
    PROMPT_MESSAGES = [
        {
            "role": "system",
            "content": (
                "Using the information contained in the context.\n"
                "Respond only to the question asked, response should be concise and relevant to the question.\n"
                "If the answer cannot be deduced from the context, do not give an answer."
            ),
        },
        {
            "role": "user",
            "content": (
                "Context: {context}\n"
                "Now here is the question you need to answer.\n"
                "Question: {question}"
            ),
        },
    ]

    def __init__(self, path=""):
        """Build the vector index and load the reader model.

        Args:
            path: model directory or hub id passed to ``from_pretrained`` for
                both the reader model and its tokenizer.
        """
        self.vectorstore = self._build_vectorstore()
        self.READER_LLM, self.RAG_PROMPT_TEMPLATE = self._build_reader(path)

    def _build_vectorstore(self):
        """Download the source pages and return a FAISS index of their chunks."""
        pages = WebBaseLoader(list(self.SOURCE_URLS)).load()

        splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
            chunk_size=self.CHUNK_SIZE,
            chunk_overlap=self.CHUNK_OVERLAP,
            add_start_index=True,
            strip_whitespace=True,
            # NOTE(review): several of these separators use regex syntax but no
            # regex flag is passed here -- confirm literal matching is intended.
            separators=MARKDOWN_SEPARATORS,
        )
        chunks = splitter.split_documents(pages)

        # Keep only the first chunk for each distinct text.
        seen_texts = set()
        unique_chunks = []
        for chunk in chunks:
            if chunk.page_content not in seen_texts:
                seen_texts.add(chunk.page_content)
                unique_chunks.append(chunk)

        embedding_model = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL_NAME,
            # The same embedding object serves every request-time query, so it
            # runs in-process. NOTE(review): multi-process mode is understood
            # to start a worker pool per embedding call and to skip
            # encode_kwargs -- verify against the installed langchain_community.
            multi_process=False,
            model_kwargs={"device": "cuda"},
            encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
        )
        return FAISS.from_documents(
            unique_chunks, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )

    def _build_reader(self, path):
        """Return ``(generation pipeline, prompt template)`` for the model at ``path``."""
        # 4-bit NF4 weights with double quantization; compute in bfloat16.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = AutoModelForCausalLM.from_pretrained(path, quantization_config=bnb_config)
        tokenizer = AutoTokenizer.from_pretrained(path)

        reader_llm = pipeline(
            model=model,
            tokenizer=tokenizer,
            task="text-generation",
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            return_full_text=False,
            max_new_tokens=256,
        )
        # Render the chat messages to plain text once; the {context} and
        # {question} placeholders are filled in for every request.
        prompt_template = tokenizer.apply_chat_template(
            self.PROMPT_MESSAGES, tokenize=False, add_generation_prompt=True
        )
        return reader_llm, prompt_template

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Answer the question found under ``data["inputs"]``.

        Args:
            data: request payload. Only the ``"inputs"`` key is used; any other
                key is ignored and the payload is left unmodified.

        Returns:
            The first element of the pipeline output for the final prompt
            (for a text-generation pipeline this is expected to be a dict
            holding the generated text).

        Raises:
            ValueError: if ``"inputs"`` is missing or is not a non-empty string.
        """
        question = data.get("inputs")
        if not isinstance(question, str) or not question.strip():
            raise ValueError('request payload must contain a non-empty string under "inputs"')

        retrieved_docs = self.vectorstore.similarity_search(query=question, k=self.TOP_K)
        # Only the text of each retrieved document goes into the prompt.
        context = "\nExtracted documents:\n" + "".join(
            f"Document {i}:::\n{doc.page_content}" for i, doc in enumerate(retrieved_docs)
        )
        final_prompt = self.RAG_PROMPT_TEMPLATE.format(question=question, context=context)

        return self.READER_LLM(final_prompt)[0]