Zack Saadioui
8/24/2024
1
2
3
bash
# Create a dedicated conda environment (Python 3.11) for the GPU setup,
# then switch the current shell into it.
conda create -n langchain-gpu python=3.11
conda activate langchain-gpu
1
2
3
bash
# Install LangChain, then install llama-cpp-python (upgrading it if an
# older copy is already present).
pip install langchain
pip install -U llama-cpp-python
1
2
bash
# Force a from-source rebuild of llama-cpp-python with the CUDA backend.
# GGML_CUDA is the current name of the CMake switch; the older
# LLAMA_CUBLAS spelling was retired upstream and no longer enables CUDA.
# --force-reinstall and --no-cache-dir make pip rebuild instead of reusing
# a previously built (CPU-only) wheel.
CMAKE_ARGS="-DGGML_CUDA=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir
1 2 3 4 5
# Create and activate a separate environment for OpenLLM.
conda create --name openllm python=3.11
conda activate openllm
# Install OpenLLM with pip: the "[vllm]" extras syntax is a pip feature and
# is not understood by `conda install`.
pip install openllm
pip install "openllm[vllm]"
# Serve facebook/opt-2.7b through the vLLM backend on port 3000.
openllm start facebook/opt-2.7b --backend vllm --port 3000
1
2
3
4
bash
# Fetch the llama.cpp sources into ./llama.cpp and install the Python
# requirements listed in that checkout.
git clone https://github.com/ggerganov/llama.cpp llama.cpp
cd llama.cpp
pip install --requirement requirements.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
from langchain import PromptTemplate, LLMChain
from langchain.llms import LlamaCpp

# Define the model path
model_path = "[Your model path]"


def run_chatbot():
    """Ask the model one sample question and print its response.

    Builds a ``LlamaCpp`` LLM from the module-level ``model_path``, wraps it
    in an ``LLMChain`` with a question/answer prompt template, runs the chain
    on a fixed question and prints whatever the chain returns.
    """
    llama = LlamaCpp(model_path=model_path)
    # "{question}" is the single placeholder filled in by the chain.
    template = "Question: {question} \nAnswer: Let's work step by step to ensure the right answer."
    prompt = PromptTemplate(template=template, input_variables=["question"])
    llm_chain = LLMChain(prompt=prompt, llm=llama)
    response = llm_chain.run("What is the capital of France?")
    print(response)


if __name__ == '__main__':
    run_chatbot()
1
# Print a one-off snapshot from the NVIDIA driver's status tool.
nvidia-smi
1
2
bash
# Re-run nvidia-smi repeatedly via watch so the readout keeps refreshing
# while a workload runs.
watch nvidia-smi
1 2 3
# For example
# NOTE: `model` and `input_ids` are not defined in this snippet; they are
# expected to exist already in the surrounding code.
batch_size = 16  # Test several batch sizes to find the sweet spot
output = model.generate(input_ids, num_return_sequences=batch_size)
Copyright © Arsturn 2025