Koda Docs: Python Library

from koda import Koda

k = Koda()

Pull a Model

# Download a model (no-op if already present)
k.pull("llama3.2")

Chat

reply = k.chat("llama3.2", [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
])
print(reply)

Streaming Chat

for token in k.chat("llama3.2", [{"role": "user", "content": "Tell me a story"}], stream=True):
    print(token, end="", flush=True)

Text Completion

# Raw text completion (no chat template applied)
text = k.generate("llama3.2", "The capital of France is")
print(text)

Streaming Completion

for token in k.generate("llama3.2", "Once upon a time", stream=True):
    print(token, end="", flush=True)

Model Management

print(k.models())    # all models in the registry
print(k.loaded())    # models currently in memory
k.unload("llama3.2") # free memory

Custom Context Window

k = Koda(n_ctx=8192)

On this page (Python Library):

- Pull a Model
- Chat
- Streaming Chat
- Text Completion
- Streaming Completion
- Model Management
- Custom Context Window