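# Interactive terminal chat with a local GGUF model: streams tokens as they
# are generated and keeps the full conversation history in memory.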
from llm_inference import load_chat_llama_model, utter
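
# `llm_inference` is a project-local helper module, not a published package.
# A minimal sketch of what it might provide, assuming it wraps
# llama-cpp-python (hypothetical; the real module may differ):
#
#     import sys
#     from llama_cpp import Llama
#
#     def load_chat_llama_model(path: str) -> Llama:
#         # n_ctx here is an illustrative default, not a known value.
#         return Llama(model_path=path, n_ctx=8192, verbose=False)
#
#     def utter(text: str) -> None:
#         # Write without a trailing newline and flush immediately, so
#         # streamed tokens appear in the terminal as they arrive.
#         sys.stdout.write(text)
#         sys.stdout.flush()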
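
# Path to a local GGUF model file: a Q8_0 quantization of Daredevil-8B-abliterated.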
model_path = "/home/gregory/programming/testWithPython/Daredevil-GGUF/daredevil-8b-abliterated.Q8_0.gguf"
llm = load_chat_llama_model(model_path)

system_initial_prompt = "You are a helpful AI assistant. You like to share your knowledge with the user."
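
# Conversation history in OpenAI-style chat format; grows with every turn.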
context = [
    {"role": "system", "content": system_initial_prompt},
]

print("[Model loaded]")
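
# Simple REPL: read a prompt, stream the reply token by token, and record
# both sides of the exchange in the history.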
while True:
    utter("=> ")
    user_prompt = input()
    context.append({"role": "user", "content": user_prompt})
    pieces: list[str] = []
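    # Ctrl-C aborts the current reply without killing the program; whatever
    # was generated before the interrupt is still recorded below.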
    try:
        chunk_stream = llm.create_chat_completion(
            messages=context,
            temperature=0.7,
            top_k=40,
            top_p=0.95,
            repeat_penalty=1.0,  # 1.0 applies no repetition penalty
            max_tokens=999999999,  # effectively unlimited: stop at EOS or when the context window fills
            stream=True,
        )
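        # With stream=True the call yields OpenAI-style chunks, each carrying
        # an incremental "delta" rather than a complete message.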
        for chunk in chunk_stream:
            delta = chunk["choices"][0]["delta"]
            piece = delta.get("content")
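
            # Metadata-only chunks (the first carries the role, the last a
            # finish_reason) have no "content"; skip them.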
            if not piece:
                continue

            pieces.append(piece)
            utter(piece)
    except KeyboardInterrupt:
        print("\n[stopped]")
    utter("\n")
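    # Record the reply (possibly partial, if interrupted) so the model sees a
    # consistent history on the next turn.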
    context.append({"role": "assistant", "content": "".join(pieces)})