struct llama_model_params {
    // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
    ggml_backend_dev_t * devices;

    int32_t n_gpu_layers;             // number of layers to store in VRAM
    enum llama_split_mode split_mode; // how to split the model across multiple GPUs

    // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
    int32_t main_gpu;

    // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
    const float * tensor_split;

    // comma separated list of RPC servers to use for offloading
    const char * rpc_servers;

    // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
    // If the provided progress_callback returns true, model loading continues.
    // If it returns false, model loading is immediately aborted.
    llama_progress_callback progress_callback;

    // context pointer passed to the progress callback
    void * progress_callback_user_data;

    // override key-value pairs of the model meta data
    const struct llama_model_kv_override * kv_overrides;

    // Keep the booleans together to avoid misalignment during copy-by-value.
    bool vocab_only;    // only load the vocabulary, no weights
    bool use_mmap;      // use mmap if possible
    bool use_mlock;     // force system to keep model in RAM
    bool check_tensors; // validate model tensor data
};
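Before looking at how the loader consumes these fields, here is a minimal sketch of how the struct is typically filled in from user code. The model path is hypothetical, unset fields keep the values from llama_model_default_params(), and llama_load_model_from_file() is the public entry point that eventually reaches the internal loading code shown later.

#include "llama.h"

// minimal sketch, assuming a local GGUF file at a hypothetical path
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = 99;                    // offload as many layers as fit in VRAM
mparams.split_mode   = LLAMA_SPLIT_MODE_NONE; // keep the whole model on main_gpu
mparams.main_gpu     = 0;
mparams.use_mmap     = true;                  // map the file instead of copying it into RAM

llama_model * model = llama_load_model_from_file("model.gguf", mparams); // hypothetical path
if (model == NULL) {
    // loading failed (bad path, unsupported file, out of memory, ...)
}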
struct llama_context_params {
    uint32_t n_ctx;           // text context, 0 = from model
    uint32_t n_batch;         // logical maximum batch size that can be submitted to llama_decode
    uint32_t n_ubatch;        // physical maximum batch size
    uint32_t n_seq_max;       // max number of sequences (i.e. distinct states for recurrent models)
    int32_t  n_threads;       // number of threads to use for generation
    int32_t  n_threads_batch; // number of threads to use for batch processing

    enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
    enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
    enum llama_attention_type    attention_type;    // attention type to use for embeddings

    // ref: https://github.com/ggerganov/llama.cpp/pull/2054
    float    rope_freq_base;   // RoPE base frequency, 0 = from model
    float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
    float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
    float    yarn_attn_factor; // YaRN magnitude scaling factor
    float    yarn_beta_fast;   // YaRN low correction dim
    float    yarn_beta_slow;   // YaRN high correction dim
    uint32_t yarn_orig_ctx;    // YaRN original context size
    float    defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)

    enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
    enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

    // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
    // TODO: move at the end of the struct
    bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
    bool embeddings;  // if true, extract embeddings (together with logits)
    bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
    bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
    bool no_perf;     // whether to measure performance timings

    // Abort callback
    // if it returns true, execution of llama_decode() will be aborted
    // currently works only with CPU execution
    ggml_abort_callback abort_callback;
    void *              abort_callback_data;
};
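For comparison with the fields above, here is a small sketch (not taken from the walkthrough) of overriding a few of these defaults and creating a context with llama_new_context_with_model(), the public constructor in this version of the API; the concrete values are illustrative assumptions.

llama_context_params cparams = llama_context_default_params();
cparams.n_ctx           = 4096; // 0 would take the training context size from the model
cparams.n_batch         = 512;  // logical batch size for llama_decode
cparams.n_threads       = 8;    // generation threads
cparams.n_threads_batch = 8;    // prompt-processing threads

llama_context * ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
    // context creation failed
}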
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
    // split the servers set them into model->rpc_servers
    <...>
    model->rpc_servers.push_back(servers);
}
    for (const std::string & server : model->rpc_servers) {
        ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
        <...>
    }
}
// create list of devices to use with this model
if (params.devices) {
    for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
        model->devices.push_back(*dev);
    }
} else {
    // use all available devices
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        <...>
    }
}
// if using single GPU mode, remove all except the main GPU
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
    <...>
    ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
    model->devices.clear();
    model->devices.push_back(main_gpu);
}
for (auto * dev : model->devices) {
    <...>
}
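The devices field of llama_model_params feeds the first branch above. As a hypothetical sketch of the caller side, assuming the ggml backend-registry helpers ggml_backend_dev_count/ggml_backend_dev_get/ggml_backend_dev_type are available, a NULL-terminated list restricted to GPU devices could be built like this:

std::vector<ggml_backend_dev_t> devs;
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
    ggml_backend_dev_t dev = ggml_backend_dev_get(i);
    if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
        devs.push_back(dev); // keep only GPU devices
    }
}
devs.push_back(nullptr); // the list must be NULL-terminated

llama_model_params mparams = llama_model_default_params();
mparams.devices = devs.data(); // the loader will take the first branch above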
int status = llama_model_load(path_model, *model, params);
<...>
// some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
// the tensor is duplicated
// to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
llm_tensor tn_tensor = tn.tensor;
<...>
// tensors with "bias" suffix are always used with GGML_OP_ADD
ggml_op op;
bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
<...>
// avoid using a host buffer when using mmap
<...>
ggml_context * ctx = ctx_for_buft(buft);
// if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
if (flags & llama_model_loader::TENSOR_DUPLICATED) {
    ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
    if (t) {
        return t;
    }
}
return ml.create_tensor(ctx, tn, ne, flags);
};
// TODO: do not allocate each time
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
    cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
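For context, the surrounding sampler code wraps this vector in a llama_token_data_array before running the sampler chain over it. A sketch using the field layout from this era's llama.h (selected holds the index of the token the chain eventually picks):

llama_token_data_array cur_p = {
    /* .data     = */ cur.data(),
    /* .size     = */ cur.size(),
    /* .selected = */ -1,    // no token chosen yet
    /* .sorted   = */ false, // samplers may sort/filter the candidates in place
};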
// prepare a batch for the prompt
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
llama_token new_token_id;
while (true) {
    // check if we have enough space in the context to evaluate this batch
    <...>

    // sample the next token
    new_token_id = llama_sampler_sample(smpl, ctx, -1);

    // is it an end of generation?
    if (llama_token_is_eog(model, new_token_id)) {
        break;
    }

    // convert the token to a string, print it and add it to the response
    <...>
    response += piece;

    // prepare the next batch with the sampled token
    batch = llama_batch_get_one(&new_token_id, 1);
}
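The smpl object passed to llama_sampler_sample above is a sampler chain constructed elsewhere. A sketch of a typical construction follows; the particular min_p/temperature/dist combination is an assumption, not something fixed by the snippet:

llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));           // drop unlikely tokens
llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));                // temperature scaling
llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));  // final random pick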
// add the user input to the message list and format it
messages.push_back({"user", strdup(user.c_str())});
int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
<...>
// remove previous messages to obtain the prompt to generate the response
std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
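A sketch of the follow-up steps, assuming a hypothetical generate() helper that wraps the decode/sample loop shown earlier: the assistant reply is appended to messages, and prev_len is refreshed by applying the chat template without the assistant prefix and without an output buffer, so only the formatted length is returned.

std::string response = generate(prompt); // hypothetical helper wrapping the loop above

// keep the conversation state in sync for the next turn
messages.push_back({"assistant", strdup(response.c_str())});
prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(),
                                     false, nullptr, 0); // length only, no buffer needed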