import os

# Pin CUDA device enumeration and select GPU 1 *before* any GPU-aware
# library (llama.cpp backend) is imported, or the setting has no effect.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

# Shared manager that streams generated tokens to stdout as they arrive.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])


def get_mixtral():
    """Build a streaming LlamaCpp wrapper around the Mixtral-8x7B Q6_K weights.

    Returns:
        LlamaCpp: model instance that streams tokens to stdout via the
        module-level ``callback_manager``.
    """
    mixtral8x7b = LlamaCpp(
        model_path="model/weight/mixtral-8x7b-instruct-v0.1.Q6_K.gguf",
        temperature=0.75,
        n_gpu_layers=33,
        n_ctx=20000,
        n_threads=30,  # BUGFIX: was `n_thread` — not a LlamaCpp field, so the
        # intended 30-thread setting was rejected/ignored.
        n_batch=32,
        max_tokens=2024,
        # NOTE(review): top_p > 1 is outside the (0, 1] nucleus-sampling range
        # and effectively disables top-p filtering — confirm whether 0.3 or
        # 1.0 was intended before changing the value.
        top_p=3,
        callback_manager=callback_manager,
        verbose=True,  # Verbose is required to pass to the callback manager
    )
    return mixtral8x7b


def get_nerualchat7bv3_2():
    """Build a streaming LlamaCpp wrapper around Neural-Chat-7B v3.2 Q5_K_M weights.

    Note: the function name's "nerual" spelling is kept as-is because callers
    elsewhere may reference it.

    Returns:
        LlamaCpp: model instance that streams tokens to stdout via the
        module-level ``callback_manager``.
    """
    neuralchat7bv3_2 = LlamaCpp(
        model_path="model/weight/neural-chat-7b-v3-2.Q5_K_M.gguf",
        temperature=0.75,
        n_gpu_layers=33,
        n_ctx=20000,
        n_threads=30,  # BUGFIX: was `n_thread` — not a LlamaCpp field, so the
        # intended 30-thread setting was rejected/ignored.
        n_batch=32,
        max_tokens=512,
        top_p=1,
        callback_manager=callback_manager,
        verbose=True,  # Verbose is required to pass to the callback manager
    )
    return neuralchat7bv3_2