Python Library
Use Koda directly in your Python code. No server, no daemon, no subprocess.
from koda import Koda
k = Koda()

Pull a Model
# Download a model (no-op if already present)
k.pull("llama3.2")

Chat
reply = k.chat("llama3.2", [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is 2 + 2?"},
])
print(reply)

Streaming Chat
for token in k.chat("llama3.2", [{"role": "user", "content": "Tell me a story"}], stream=True):
    print(token, end="", flush=True)

Text Completion
# Raw text completion (no chat template applied)
text = k.generate("llama3.2", "The capital of France is")
print(text)

Streaming Completion
for token in k.generate("llama3.2", "Once upon a time", stream=True):
    print(token, end="", flush=True)

Model Management
print(k.models()) # all models in the registry
print(k.loaded()) # models currently in memory
k.unload("llama3.2") # free memory

Custom Context Window
k = Koda(n_ctx=8192)