Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

- refactor: Replace deprecated llama.cpp references in library, docs, and examples by @abetlen in #2170
- feat: Update llama.cpp to ggerganov/llama.cpp@f49e9178767d557a522618b16ce8694f9ddac628 by @abetlen in #2169
- feat(server): Add model-load `chat_template_kwargs` support and document the CLI/config usage by @abetlen in #2168
- ci: Publish release wheels as `py3-none` by @Bing-su in #2166
- fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165
Expand Down
16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -717,16 +717,20 @@ Below is a short example demonstrating how to use the low-level API to tokenize
```python
import llama_cpp
import ctypes
llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
params = llama_cpp.llama_context_default_params()
llama_cpp.llama_backend_init() # Must be called once at the start of each program
model_params = llama_cpp.llama_model_default_params()
ctx_params = llama_cpp.llama_context_default_params()
prompt = b"Q: Name the planets in the solar system? A: "
# use bytes for char * params
model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)
max_tokens = params.n_ctx
model = llama_cpp.llama_model_load_from_file(b"./models/7b/llama-model.gguf", model_params)
ctx = llama_cpp.llama_init_from_model(model, ctx_params)
vocab = llama_cpp.llama_model_get_vocab(model)
max_tokens = ctx_params.n_ctx
# use ctypes arrays for array params
tokens = (llama_cpp.llama_token * int(max_tokens))()
n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
n_tokens = llama_cpp.llama_tokenize(vocab, prompt, len(prompt), tokens, max_tokens, True, False)
llama_cpp.llama_free(ctx)
llama_cpp.llama_model_free(model)
```

Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
Expand Down
4 changes: 2 additions & 2 deletions examples/batch-processing/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
# path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf"

# model_params = llama_cpp.llama_model_default_params()
# model = llama_cpp.llama_load_model_from_file(path, model_params)
# model = llama_cpp.llama_model_load_from_file(path, model_params)

# if model is None:
# raise RuntimeError(f"Failed to load model from file: {path}")


# ctx_params = llama_cpp.llama_context_default_params()
# ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
# ctx = llama_cpp.llama_init_from_model(model, ctx_params)

# if ctx is None:
# raise RuntimeError("Failed to create context")
Expand Down
31 changes: 17 additions & 14 deletions examples/low_level_api/low_level_api_chat_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,19 +79,22 @@ def __init__(self, params: GptParams) -> None:
self.lparams.use_mlock = self.params.use_mlock
self.lparams.use_mmap = self.params.use_mmap

self.model = llama_cpp.llama_load_model_from_file(
self.model = llama_cpp.llama_model_load_from_file(
self.params.model.encode("utf8"), self.lparams
)
self.vocab = llama_cpp.llama_model_get_vocab(self.model)

# Context Params.
self.cparams = llama_cpp.llama_context_default_params()

self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
self.ctx = llama_cpp.llama_init_from_model(self.model, self.cparams)
if not self.ctx:
raise RuntimeError(f"error: failed to load model '{self.params.model}'")

if self.params.ignore_eos:
self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")
self.params.logit_bias[llama_cpp.llama_vocab_eos(self.vocab)] = -float(
"inf"
)

if len(self.params.lora_adapter) > 0:
if (
Expand Down Expand Up @@ -153,7 +156,7 @@ def __init__(self, params: GptParams) -> None:
_session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
_n_token_count_out = llama_cpp.c_size_t()
if (
llama_cpp.llama_load_session_file(
llama_cpp.llama_state_load_file(
self.ctx,
self.params.path_session.encode("utf8"),
_session_tokens,
Expand Down Expand Up @@ -314,7 +317,7 @@ def __init__(self, params: GptParams) -> None:
def _tokenize(self, prompt, bos=True):
_arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
_n = llama_cpp.llama_tokenize(
self.model,
self.vocab,
prompt.encode("utf8", errors="ignore"),
len(prompt),
_arr,
Expand Down Expand Up @@ -406,7 +409,7 @@ def generate(self):
if len(self.embd_inp) <= self.input_consumed: # && !is_interacting
# out of user input, sample next token
top_k = (
llama_cpp.llama_n_vocab(self.ctx)
llama_cpp.llama_vocab_n_tokens(self.vocab)
if self.params.top_k <= 0
else self.params.top_k
)
Expand All @@ -419,7 +422,7 @@ def generate(self):
# optionally save the session on first sample (for faster prompt loading next time)
if len(self.params.path_session) > 0 and self.need_to_save_session:
self.need_to_save_session = False
llama_cpp.llama_save_session_file(
llama_cpp.llama_state_save_file(
self.ctx,
self.params.path_session.encode("utf8"),
(llama_cpp.llama_token * len(self.session_tokens))(
Expand All @@ -431,7 +434,7 @@ def generate(self):
id = 0

logits = llama_cpp.llama_get_logits(self.ctx)
n_vocab = llama_cpp.llama_n_vocab(self.model)
n_vocab = llama_cpp.llama_vocab_n_tokens(self.vocab)

# Apply params.logit_bias map
for key, value in self.params.logit_bias.items():
Expand All @@ -448,7 +451,7 @@ def generate(self):
)

# Apply penalties
nl_logit = logits[llama_cpp.llama_token_nl(self.ctx)]
nl_logit = logits[llama_cpp.llama_vocab_nl(self.vocab)]
last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)

_arr = (llama_cpp.llama_token * last_n_repeat)(
Expand All @@ -470,7 +473,7 @@ def generate(self):
# last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))

if not self.params.penalize_nl:
logits[llama_cpp.llama_token_nl()] = nl_logit
logits[llama_cpp.llama_vocab_nl(self.vocab)] = nl_logit

if self.params.temp <= 0:
# Greedy sampling
Expand Down Expand Up @@ -539,7 +542,7 @@ def generate(self):

# replace end of text token with newline token when in interactive mode
if (
id == llama_cpp.llama_token_eos(self.ctx)
id == llama_cpp.llama_vocab_eos(self.vocab)
and self.params.interactive
and not self.params.instruct
):
Expand Down Expand Up @@ -599,8 +602,8 @@ def generate(self):
break

# end of text token
if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(
self.ctx
if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_vocab_eos(
self.vocab
):
if not self.params.instruct:
for i in self.llama_token_eot:
Expand Down Expand Up @@ -636,7 +639,7 @@ def token_to_str(self, token_id: int) -> bytes:
size = 32
buffer = (ctypes.c_char * size)()
n = llama_cpp.llama_token_to_piece(
self.model, llama_cpp.llama_token(token_id), buffer, size
self.vocab, llama_cpp.llama_token(token_id), buffer, size, 0, False
)
assert n <= size
return bytes(buffer[:n])
Expand Down
26 changes: 14 additions & 12 deletions examples/low_level_api/low_level_api_llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import llama_cpp

llama_cpp.llama_backend_init(numa=False)
llama_cpp.llama_backend_init()

N_THREADS = multiprocessing.cpu_count()
MODEL_PATH = os.environ.get("MODEL", "../models/7B/ggml-model.bin")
Expand All @@ -13,8 +13,9 @@

lparams = llama_cpp.llama_model_default_params()
cparams = llama_cpp.llama_context_default_params()
model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
ctx = llama_cpp.llama_new_context_with_model(model, cparams)
model = llama_cpp.llama_model_load_from_file(MODEL_PATH.encode("utf-8"), lparams)
ctx = llama_cpp.llama_init_from_model(model, cparams)
vocab = llama_cpp.llama_model_get_vocab(model)

# determine the required inference memory per token:
tmp = [0, 1, 2, 3]
Expand All @@ -28,13 +29,13 @@

embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
n_of_tok = llama_cpp.llama_tokenize(
model=model,
text=bytes(str(prompt), "utf-8"),
text_len=len(embd_inp),
vocab=vocab,
text=prompt,
text_len=len(prompt),
tokens=embd_inp,
n_max_tokens=len(embd_inp),
add_bos=False,
special=False,
n_tokens_max=len(embd_inp),
add_special=False,
parse_special=False,
)
embd_inp = embd_inp[:n_of_tok]

Expand Down Expand Up @@ -70,7 +71,7 @@
embd = []
if len(embd_inp) <= input_consumed:
logits = llama_cpp.llama_get_logits(ctx)
n_vocab = llama_cpp.llama_n_vocab(model)
n_vocab = llama_cpp.llama_vocab_n_tokens(vocab)

_arr = (llama_cpp.llama_token_data * n_vocab)(
*[
Expand Down Expand Up @@ -114,7 +115,7 @@
size = 32
buffer = (ctypes.c_char * size)()
n = llama_cpp.llama_token_to_piece(
model, llama_cpp.llama_token(id), buffer, size
vocab, llama_cpp.llama_token(id), buffer, size, 0, False
)
assert n <= size
print(
Expand All @@ -123,11 +124,12 @@
flush=True,
)

if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(ctx):
if len(embd) > 0 and embd[-1] == llama_cpp.llama_vocab_eos(vocab):
break

print()

llama_cpp.llama_print_timings(ctx)

llama_cpp.llama_free(ctx)
llama_cpp.llama_model_free(model)
15 changes: 8 additions & 7 deletions examples/notebooks/Batching.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,10 @@
"source": [
"params = llama_cpp.llama_model_default_params()\n",
"params.n_gpu_layers = 35\n",
"model = llama_cpp.llama_load_model_from_file(\n",
"model = llama_cpp.llama_model_load_from_file(\n",
" b\"/workspaces/llama-cpp-python/mistral-7b-v0.1.Q2_K.gguf\", params\n",
") # Update this to whatever"
") # Update this to the path of your local GGUF model file\n",
"vocab = llama_cpp.llama_model_get_vocab(model)"
]
},
{
Expand All @@ -149,7 +150,7 @@
"\n",
"tokens = (llama_cpp.llama_token * n_ctx)()\n",
"tokens_len = llama_cpp.llama_tokenize(\n",
" model, prompt, len(prompt), tokens, len(tokens), True, True\n",
" vocab, prompt, len(prompt), tokens, len(tokens), True, True\n",
")\n",
"print(tokens[:tokens_len])\n",
"\n",
Expand Down Expand Up @@ -188,7 +189,7 @@
"ctx_params.n_batch = max(n_len, n_parallel)\n",
"ctx_params.n_threads = 1\n",
"ctx_params.n_threads_batch = 1\n",
"ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)"
"ctx = llama_cpp.llama_init_from_model(model, ctx_params)"
]
},
{
Expand Down Expand Up @@ -338,14 +339,14 @@
" # Sample the next token using the sampler chain\n",
" new_token_id = llama_cpp.llama_sampler_sample(sampler_chain, ctx, -1)\n",
"\n",
" if new_token_id == llama_cpp.llama_token_eos(ctx) or n_cur == n_len:\n",
" if new_token_id == llama_cpp.llama_vocab_eos(vocab) or n_cur == n_len:\n",
" i_batch[i] = -1\n",
" continue\n",
"\n",
" buf = (ctypes.c_char * 32)()\n",
" \n",
" # Convert token ID to text\n",
" outlen = llama_cpp.llama_token_to_piece(model, new_token_id, buf, len(buf), 0, False)\n",
" outlen = llama_cpp.llama_token_to_piece(vocab, new_token_id, buf, len(buf), 0, False)\n",
" streams[i] += bytes(buf[:outlen]).decode(\"utf-8\")\n",
"\n",
" batch.token[batch.n_tokens] = new_token_id\n",
Expand Down Expand Up @@ -411,7 +412,7 @@
"metadata": {},
"outputs": [],
"source": [
"llama_cpp.llama_free_model(model)"
"llama_cpp.llama_model_free(model)"
]
},
{
Expand Down
6 changes: 3 additions & 3 deletions llama_cpp/_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def token_eos(self) -> int:
return llama_cpp.llama_vocab_eos(self.vocab)

def token_cls(self) -> int:
return llama_cpp.llama_vocab_cls(self.vocab)
return llama_cpp.llama_vocab_bos(self.vocab)

def token_sep(self) -> int:
return llama_cpp.llama_vocab_sep(self.vocab)
Expand Down Expand Up @@ -317,9 +317,9 @@ def get_state_size(self) -> int:

# TODO: set_state_data

# TODO: llama_load_session_file
# TODO: llama_state_load_file

# TODO: llama_save_session_file
# TODO: llama_state_save_file

def decode(self, batch: LlamaBatch):
return_code = llama_cpp.llama_decode(
Expand Down
11 changes: 7 additions & 4 deletions llama_cpp/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -1351,7 +1351,7 @@ def logit_bias_processor(
logits_processor=logits_processor,
grammar=grammar,
):
if llama_cpp.llama_token_is_eog(self._model.vocab, token):
if llama_cpp.llama_vocab_is_eog(self._model.vocab, token):
text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
finish_reason = "stop"
break
Expand Down Expand Up @@ -2148,13 +2148,13 @@ def __setstate__(self, state):
def save_state(self) -> LlamaState:
if self.verbose:
print("Llama.save_state: saving llama state", file=sys.stderr)
state_size = llama_cpp.llama_get_state_size(self._ctx.ctx)
state_size = llama_cpp.llama_state_get_size(self._ctx.ctx)
if self.verbose:
print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr)
llama_state = (ctypes.c_uint8 * int(state_size))()
if self.verbose:
print("Llama.save_state: allocated state", file=sys.stderr)
n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state)
n_bytes = llama_cpp.llama_state_get_data(self._ctx.ctx, llama_state, state_size)
if self.verbose:
print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr)
if int(n_bytes) > int(state_size):
Expand Down Expand Up @@ -2187,7 +2187,10 @@ def load_state(self, state: LlamaState) -> None:
LLamaStateArrayType = ctypes.c_uint8 * state_size
llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state)

if llama_cpp.llama_set_state_data(self._ctx.ctx, llama_state) != state_size:
if (
llama_cpp.llama_state_set_data(self._ctx.ctx, llama_state, state_size)
!= state_size
):
raise RuntimeError("Failed to set llama state data")

def n_ctx(self) -> int:
Expand Down
Loading