Instructions to use lchakkei/2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use lchakkei/2 with Transformers:

# High-level route: build a text-generation pipeline and hand it the chat messages
from transformers import pipeline

chat = [{"role": "user", "content": "Who are you?"}]
generator = pipeline("text-generation", model="lchakkei/2")
generator(chat)

# Lower-level route: load the tokenizer and the causal LM explicitly
from transformers import AutoTokenizer, AutoModelForCausalLM

repo_id = "lchakkei/2"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

chat = [{"role": "user", "content": "Who are you?"}]

# Render the chat through the tokenizer's chat template and tokenize it in one call
encoded = tokenizer.apply_chat_template(
    chat,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)
encoded = encoded.to(model.device)

generated = model.generate(**encoded, max_new_tokens=40)

# Slice off the prompt positions so only the tokens after the prompt are decoded
prompt_length = encoded["input_ids"].shape[-1]
print(tokenizer.decode(generated[0][prompt_length:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use lchakkei/2 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "lchakkei/2"
# Call the server using curl (OpenAI-compatible API):
# The request is a single-message chat completion sent to port 8000 on localhost.
# NOTE(review): presumably issued from a second shell once the server is up — confirm.
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "lchakkei/2",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

# Run the model through the `docker model run` subcommand, referenced by its hf.co path
docker model run hf.co/lchakkei/2

SGLang

How to use lchakkei/2 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
# Binds to all interfaces (0.0.0.0) on port 30000, the port the curl call below targets.
python3 -m sglang.launch_server \
    --model-path "lchakkei/2" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
# NOTE(review): presumably issued from a second shell once the server is up — confirm.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "lchakkei/2",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

# Start the SGLang server inside the lmsysorg/sglang:latest image.
# The command publishes port 30000, mounts the local Hugging Face cache into the
# container, and passes HF_TOKEN through the environment.
# NOTE(review): "<secret>" looks like a placeholder for a real token — confirm before running.
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "lchakkei/2" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
# Targets the published port 30000 on localhost.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "lchakkei/2",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use lchakkei/2 with Docker Model Runner:
```
# Run the model with Docker Model Runner, referenced by its hf.co path
docker model run hf.co/lchakkei/2
```

2 / handler.py

lchakkei

Update handler.py

0c29b88 verified about 2 years ago

raw

history blame contribute delete

6.17 kB

	import torch
	import locale
	import os
	from typing import Dict, List, Any
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
	from langchain.llms import HuggingFacePipeline
	from langchain.retrievers.document_compressors import LLMChainExtractor
	from langchain.retrievers import ContextualCompressionRetriever
	from langchain.vectorstores import Chroma
	from langchain import PromptTemplate, LLMChain
	from langchain.chains import RetrievalQA, ConversationalRetrievalChain
	from langchain.prompts import PromptTemplate
	from langchain.prompts.prompt import PromptTemplate
	from langchain.memory import ConversationBufferMemory
	from langchain.embeddings import HuggingFaceBgeEmbeddings
	from langchain.document_loaders import WebBaseLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.chains.qa_with_sources import load_qa_with_sources_chain
	from langchain.chains.combine_documents import create_stuff_documents_chain
	from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
	from langchain_core.messages import HumanMessage
	from langchain_core.output_parsers import StrOutputParser
	from langchain_core.runnables import RunnableLambda, RunnableBranch, RunnablePassthrough
	from operator import itemgetter
	from langchain.schema import format_document
	from langchain.memory import ConversationBufferMemory
	from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
	from langchain_core.runnables import RunnableParallel
	from typing import Optional
	from langchain.vectorstores import FAISS
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores.utils import DistanceStrategy

	# Model id used for embeddings; EndpointHandler.__init__ passes it both to
	# AutoTokenizer.from_pretrained (for token-based chunk sizing) and to HuggingFaceEmbeddings.
	EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
	# Separator candidates passed as `separators=` to the RecursiveCharacterTextSplitter
	# built in EndpointHandler.__init__.
	# NOTE(review): several entries ("#{1,6}", "---+", "___+", "\*+") look like regex
	# patterns, but the splitter is created without any regex flag — confirm whether
	# they are meant to be matched as regexes or as literal text.
	# NOTE(review): the third entry's run of backslashes looks garbled — verify it
	# against the intended markdown horizontal-rule pattern.
	MARKDOWN_SEPARATORS = [
	"\n#{1,6} ",
	"```\n",
	"\n\\\\\\*+\n",
	"\n---+\n",
	"\n___+\n",
	"\n\n",
	"\n",
	" ",
	"",
	]

	class EndpointHandler():
	    """Retrieval-augmented question-answering handler.

	    Construction loads a fixed list of web pages, splits them into chunks,
	    drops duplicate chunks, indexes the rest in a FAISS store, and loads a
	    4-bit quantized text-generation pipeline. Calling the instance retrieves
	    the two most similar chunks for a question and asks the reader model to
	    answer from them.
	    """

	    def __init__(self, path=""):
	        """Build the document index and the reader pipeline.

	        Args:
	            path: Model identifier or directory passed to ``from_pretrained``
	                for both the reader model and its tokenizer.
	        """
	        self.vectorstore = self._build_vectorstore()
	        self.READER_LLM, self.RAG_PROMPT_TEMPLATE = self._build_reader(path)

	    def _build_vectorstore(self):
	        """Load, chunk, de-duplicate and index the source pages; return the FAISS store."""
	        urls = [
	            "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2).html",
	            "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2)/publications.html",
	            "https://www.cityu.edu.hk/media/press-release/2022/05/17/cityu-council-announces-appointment-professor-freddy-boey-next-president",
	            "https://www.cityu.edu.hk/media/press-release/2023/05/18/professor-freddy-boey-installed-5th-president-cityu",
	        ]
	        docs = WebBaseLoader(urls).load()

	        # Chunk sizes are measured with the embedding model's own tokenizer.
	        # NOTE(review): MARKDOWN_SEPARATORS contains regex-looking entries but no
	        # regex flag is passed here — confirm they are matched as intended.
	        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
	            AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
	            chunk_size=512,
	            chunk_overlap=int(512 / 10),
	            add_start_index=True,
	            strip_whitespace=True,
	            separators=MARKDOWN_SEPARATORS,
	        )

	        docs_processed = []
	        for doc in docs:
	            docs_processed.extend(text_splitter.split_documents([doc]))

	        # Keep only the first chunk for each distinct text.
	        seen_texts = set()
	        docs_processed_unique = []
	        for doc in docs_processed:
	            if doc.page_content not in seen_texts:
	                seen_texts.add(doc.page_content)
	                docs_processed_unique.append(doc)

	        embedding_model = HuggingFaceEmbeddings(
	            model_name=EMBEDDING_MODEL_NAME,
	            multi_process=True,
	            model_kwargs={"device": "cuda"},
	            encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
	        )

	        return FAISS.from_documents(
	            docs_processed_unique, embedding_model, distance_strategy=DistanceStrategy.COSINE
	        )

	    def _build_reader(self, path):
	        """Load the quantized reader model; return ``(pipeline, prompt_template)``."""
	        bnb_config = BitsAndBytesConfig(
	            load_in_4bit=True,
	            bnb_4bit_use_double_quant=True,
	            bnb_4bit_quant_type="nf4",
	            bnb_4bit_compute_dtype=torch.bfloat16,
	        )
	        model = AutoModelForCausalLM.from_pretrained(path, quantization_config=bnb_config)
	        tokenizer = AutoTokenizer.from_pretrained(path)

	        reader = pipeline(
	            model=model,
	            tokenizer=tokenizer,
	            task="text-generation",
	            do_sample=True,
	            temperature=0.2,
	            repetition_penalty=1.1,
	            return_full_text=False,
	            max_new_tokens=256,
	        )

	        prompt_in_chat_format = [
	            {
	                "role": "system",
	                "content": """Using the information contained in the context.
	Respond only to the question asked, response should be concise and relevant to the question.
	If the answer cannot be deduced from the context, do not give an answer.""",
	            },
	            {
	                "role": "user",
	                "content": """Context: {context}
	Now here is the question you need to answer.
	Question: {question}""",
	            },
	        ]

	        # The rendered template keeps the {context}/{question} placeholders, which
	        # __call__ fills in with str.format for each request.
	        prompt_template = tokenizer.apply_chat_template(
	            prompt_in_chat_format, tokenize=False, add_generation_prompt=True
	        )
	        return reader, prompt_template

	    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
	        """Answer a question using the indexed documents as context.

	        Args:
	            data: Request payload. ``"inputs"`` must hold the question as a
	                string. ``"date"`` is accepted and removed from the payload
	                but is not used.

	        Returns:
	            The first element of the reader pipeline's output for the prompt.

	        Raises:
	            ValueError: If the payload carries no string under ``"inputs"``.
	        """
	        # get inputs
	        inputs = data.pop("inputs", data)
	        data.pop("date", None)  # consumed as before, currently unused

	        # Without this check a payload lacking "inputs" would send the whole
	        # dict to the vector store as the search query.
	        if not isinstance(inputs, str):
	            raise ValueError('Request payload must contain a string under "inputs".')

	        retrieved_docs = self.vectorstore.similarity_search(query=inputs, k=2)

	        # we only need the text of the documents
	        retrieved_docs_text = [doc.page_content for doc in retrieved_docs]
	        context = "\nExtracted documents:\n" + "".join(
	            f"Document {i}:::\n{text}" for i, text in enumerate(retrieved_docs_text)
	        )

	        final_prompt = self.RAG_PROMPT_TEMPLATE.format(
	            question=inputs, context=context
	        )

	        # Generate the answer and return the first generation record
	        return self.READER_LLM(final_prompt)[0]