from llm_inference import load_chat_llama_model, utter

# Absolute path to the local GGUF model weights loaded below.
model_path = "/home/gregory/programming/testWithPython/Daredevil-GGUF/daredevil-8b-abliterated.Q8_0.gguf"
llm = load_chat_llama_model(model_path)

system_initial_prompt = "You are a helpful AI assistant. You like to share your knowledge with user"
# Running chat history; every user/assistant turn is appended so the model
# keeps conversational context across iterations.
context = [
    {"role": "system", "content": system_initial_prompt},
]

print("[Model loaded]")

while True:
    utter("=> ")
    try:
        user_prompt = input()
    except EOFError:
        # Ctrl-D / closed stdin: leave the loop cleanly instead of
        # crashing with an uncaught EOFError traceback.
        print()
        break

    context.append({"role": "user", "content": user_prompt})

    # Collected streamed fragments of the assistant's reply (may end up
    # partial if the user interrupts generation).
    pieces: list[str] = []
    try:
        chunk_stream = llm.create_chat_completion(
            messages=context,
            temperature=0.7,
            top_k=40,
            top_p=0.95,
            repeat_penalty=1.0,
            # None tells llama-cpp-python to generate until the context
            # window is exhausted — clearer than the previous 999999999
            # sentinel, same effective behavior.
            max_tokens=None,
            stream=True,
        )
        for chunk in chunk_stream:
            delta = chunk["choices"][0]["delta"]
            piece = delta.get("content")
            if not piece:
                # Role-only / empty deltas carry no text to print.
                continue
            pieces.append(piece)
            utter(piece)
    except KeyboardInterrupt:
        # Ctrl-C during generation: stop streaming but keep whatever was
        # produced so far as the assistant's (partial) turn.
        print("\n[stopped]")

    utter("\n")
    # Record the (possibly partial) reply so the next turn sees it.
    context.append({"role": "assistant", "content": "".join(pieces)})