# (extraction metadata — original file: 41 lines, 1.2 KiB, Python)

"""Interactive terminal chat loop for a local llama.cpp chat model.

Streams assistant tokens to the terminal as they arrive.  Pressing
Ctrl-C while a reply is being generated stops that reply (the partial
text is still kept in the conversation history) without exiting the
loop; Ctrl-C at the input prompt exits the program.
"""
from llm_inference import load_chat_llama_model, utter

# Path to the quantized GGUF model weights on disk.
model_path = "/home/gregory/programming/testWithPython/Daredevil-GGUF/daredevil-8b-abliterated.Q8_0.gguf"
llm = load_chat_llama_model(model_path)

system_initial_prompt = "You are a helpful AI assistant. You like to share your knowledge with user"
# Running chat history; grows by one user + one assistant entry per turn.
# NOTE(review): history is never truncated, so long sessions will
# eventually exceed the model's context window — confirm desired behavior.
context = [
    {"role": "system", "content": system_initial_prompt},
]
print("[Model loaded]")

while True:
    utter("=> ")
    user_prompt = input()
    context.append({"role": "user", "content": user_prompt})

    pieces: list[str] = []  # streamed fragments of the assistant reply
    try:
        chunk_stream = llm.create_chat_completion(
            messages=context,
            temperature=0.7,
            top_k=40,
            top_p=0.95,
            repeat_penalty=1.0,  # 1.0 disables the repetition penalty
            # Effectively "no limit": generation is bounded by the model's
            # context window rather than by this cap.
            max_tokens=999999999,
            stream=True,
        )
        for chunk in chunk_stream:
            delta = chunk["choices"][0]["delta"]
            piece = delta.get("content")
            if not piece:
                # Role-only / empty deltas carry no text to display.
                continue
            pieces.append(piece)
            utter(piece)
    except KeyboardInterrupt:
        # Ctrl-C during generation aborts the current reply only; the
        # partial text gathered so far is still recorded below.
        print("\n[stopped]")
    utter("\n")
    context.append({"role": "assistant", "content": "".join(pieces)})