Zack Saadioui
8/24/2024
Run the following in bash to create and activate the environment:

    conda create --name langchain-gpu python=3.11
    conda activate langchain-gpu
Then install the required packages:

    pip install langchain
    pip install llama-cpp-python --upgrade
Rebuild llama-cpp-python with CUDA (cuBLAS) support enabled:

    CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir
Set up OpenLLM and start a vLLM-backed server:

    conda create --name openllm python=3.11
    conda activate openllm
    conda install openllm
    conda install "openllm[vllm]"
    openllm start facebook/opt-2.7b --backend vllm --port 3000
Build llama.cpp from source:

    git clone https://github.com/ggerganov/llama.cpp
    cd llama.cpp
    pip install -r requirements.txt
Run the chain in Python:

    llm_chain = LLMChain(prompt=prompt, llm=llama)
    response = llm_chain.run("What is the capital of France?")
    print(response)
Check GPU status:

    nvidia-smi
Monitor GPU usage continuously in bash:

    watch nvidia-smi
Tune the batch size in Python:

    # For example
    batch_size = 16  # Test several batch sizes to find the sweet spot
    output = model.generate(input_ids, num_return_sequences=batch_size)
Copyright © Arsturn 2025