From f963b63afa0e057cfb9eba4d88407c6a0850a0d8 Mon Sep 17 00:00:00 2001 From: comex Date: Sat, 8 Apr 2023 12:24:37 -0700 Subject: [PATCH] Rewrite loading code to try to satisfy everyone: - Support all three formats (ggml, ggmf, ggjt). (However, I didn't include the hack needed to support GPT4All files without conversion. Those can still be used after converting them with convert.py from my other PR.) - Support both mmap and read (mmap is used by default, but can be disabled with `--no-mmap`, and is automatically disabled for pre-ggjt files or on platforms where mmap is not supported). - Support multi-file models like before, but automatically determine the number of parts rather than requiring `--n_parts`. - Improve validation and error checking. - Stop using the per-file type field (f16) entirely in favor of just relying on the per-tensor type/size fields. This has no immediate benefit, but makes it easier to experiment with different formats, and should make it easier to support the new GPTQ-for-LLaMa models in the future (I have some work in progress on that front). - Support VirtualLock on Windows (using the same `--mlock` option as on Unix). - Indicate loading progress when using mmap + mlock. (Which led me to the interesting observation that on my Linux machine, with a warm file cache, mlock actually takes some time, whereas mmap without mlock starts almost instantly...) - To help implement this, move mlock support from ggml to the loading code. - madvise/PrefetchVirtualMemory support (based on #740) - Switch from ifstream to the `fopen` family of functions to avoid unnecessary copying and, when mmap is enabled, allow reusing the same file descriptor for both metadata reads and mmap (whereas the existing implementation opens the file a second time to mmap). - Quantization now produces a single-file output even with multi-file inputs (not really a feature as much as 'it was easier this way'). Implementation notes: I tried to factor the code into more discrete pieces than before. Regarding code style: I tried to follow the code style, but I'm naughty and used a few advanced C++ features repeatedly: - Destructors to make it easier to ensure everything gets cleaned up. - Exceptions. I don't even usually use exceptions when writing C++, and I can remove them if desired... but here they make the loading code much more succinct while still properly handling a variety of errors, ranging from API calls failing to integer overflow and allocation failure. The exceptions are converted to error codes at the API boundary.) Co-authored-by: Pavol Rusnak (for the bit I copied from #740) --- CMakeLists.txt | 9 +- Makefile | 4 +- examples/common.cpp | 9 +- examples/common.h | 1 + examples/embedding/embedding.cpp | 1 + examples/main/main.cpp | 1 + examples/perplexity/perplexity.cpp | 1 + examples/quantize-stats/quantize-stats.cpp | 9 +- ggml.c | 78 - ggml.h | 20 +- llama.cpp | 1497 ++++++++++---------- llama.h | 13 +- llama_internal.h | 12 + llama_util.h | 383 +++++ 14 files changed, 1209 insertions(+), 829 deletions(-) create mode 100644 llama_internal.h create mode 100755 llama_util.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 07443e9..6bec1f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -140,6 +140,7 @@ if (LLAMA_ALL_WARNINGS) -Wpedantic -Wcast-qual -Wno-unused-function + -Wno-multichar ) else() # todo : msvc @@ -152,6 +153,10 @@ if (LLAMA_ALL_WARNINGS) endif() +if (MSVC) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) +endif() + if (LLAMA_LTO) include(CheckIPOSupported) check_ipo_supported(RESULT result OUTPUT output) @@ -241,7 +246,9 @@ endif() add_library(llama llama.cpp - llama.h) + llama.h + llama_internal.h + llama_util.h) target_include_directories(llama PUBLIC .) target_compile_features(llama PUBLIC cxx_std_11) # don't bump diff --git a/Makefile b/Makefile index c55338e..3e58a28 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ LDFLAGS = # warnings CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function -CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function +CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar # OS specific # TODO: support Windows @@ -142,7 +142,7 @@ default: main quantize perplexity embedding ggml.o: ggml.c ggml.h $(CC) $(CFLAGS) -c ggml.c -o ggml.o -llama.o: llama.cpp llama.h +llama.o: llama.cpp llama.h llama_util.h llama_internal.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o common.o: examples/common.cpp examples/common.h diff --git a/examples/common.cpp b/examples/common.cpp index b27aa6c..f909eed 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -1,7 +1,5 @@ #include "common.h" -#include "ggml.h" - #include #include #include @@ -161,6 +159,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.use_color = true; } else if (arg == "--mlock") { params.use_mlock = true; + } else if (arg == "--no-mmap") { + params.use_mmap = false; } else if (arg == "--mtest") { params.mem_test = true; } else if (arg == "--verbose-prompt") { @@ -240,9 +240,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - if (ggml_mlock_supported()) { + if (llama_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } + if (llama_mmap_supported()) { + fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); diff --git a/examples/common.h b/examples/common.h index 7a8848f..1ea6f74 100644 --- a/examples/common.h +++ b/examples/common.h @@ -47,6 +47,7 @@ struct gpt_params { bool instruct = false; // instruction mode (used for Alpaca models) bool ignore_eos = false; // do not stop generating after eos bool perplexity = false; // compute perplexity over the prompt + bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool mem_test = false; // compute maximum memory usage bool verbose_prompt = false; // print prompt tokens before generation diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index d397f35..2eda3ac 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -38,6 +38,7 @@ int main(int argc, char ** argv) { lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.logits_all = params.perplexity; + lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; lparams.embedding = params.embedding; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index d59eeb4..d333d0d 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -97,6 +97,7 @@ int main(int argc, char ** argv) { lparams.n_parts = params.n_parts; lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; + lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; ctx = llama_init_from_file(params.model.c_str(), lparams); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 07ed0a8..b62f00d 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -115,6 +115,7 @@ int main(int argc, char ** argv) { lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.logits_all = params.perplexity; + lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; lparams.embedding = params.embedding; diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index af1e627..203bfe8 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "llama.h" +#include "llama_internal.h" #include #include @@ -266,15 +267,13 @@ int main(int argc, char ** argv) { } } - // Sort tensors for consistent output - const auto tensors = llama_internal_get_tensor_map(ctx); - std::map tensors_sorted { tensors.begin(), tensors.end() }; + const auto &tensors = llama_internal_get_tensor_map(ctx); // check layer tensors int included_layers = 0; int64_t max_nelements = 0; bool is_f16 = false; - for (const auto& kv_tensor : tensors_sorted) { + for (const auto& kv_tensor : tensors) { if (!layer_included(params, kv_tensor.first)) { continue; } @@ -315,7 +314,7 @@ int main(int argc, char ** argv) { error_stats global_stats {}; - for (const auto& kv_tensor : tensors_sorted) { + for (const auto& kv_tensor : tensors) { if (!layer_included(params, kv_tensor.first)) { continue; } diff --git a/ggml.c b/ggml.c index dc084e6..326b8e8 100644 --- a/ggml.c +++ b/ggml.c @@ -97,17 +97,6 @@ typedef void* thread_ret_t; #define static_assert(cond, msg) _Static_assert(cond, msg) #endif -#define GGML_MLOCK_SUPPORT 0 - -#ifdef __has_include - #if __has_include() - #undef GGML_MLOCK_SUPPORT - #define GGML_MLOCK_SUPPORT 1 - #include - #endif -#endif - - /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -2690,21 +2679,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35"); -// -// ggml object -// - -struct ggml_object { - size_t offs; - size_t size; - - struct ggml_object * next; - - char padding[8]; -}; - -static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -2716,7 +2690,6 @@ struct ggml_context { size_t mem_size; void * mem_buffer; bool mem_buffer_owned; - bool mem_buffer_mlocked; bool no_alloc; int n_objects; @@ -3003,7 +2976,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.mem_size =*/ params.mem_size, /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, - /*.mem_buffer_mlocked =*/ false, /*.no_alloc =*/ params.no_alloc, /*.n_objects =*/ 0, /*.objects_begin =*/ NULL, @@ -3036,14 +3008,6 @@ void ggml_free(struct ggml_context * ctx) { GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); -#if GGML_MLOCK_SUPPORT - if (ctx->mem_buffer_mlocked) { - if (munlock(ctx->mem_buffer, ctx->mem_size)) { - fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno)); - } - } -#endif - if (ctx->mem_buffer_owned) { free(ctx->mem_buffer); } @@ -3072,48 +3036,6 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) return result; } -#ifdef __APPLE__ -#define MLOCK_SUGGESTION \ - "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ - "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" -#else -#define MLOCK_SUGGESTION \ - "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" -#endif - -bool ggml_mlock_supported(void) { - return GGML_MLOCK_SUPPORT; -} - -bool ggml_mlock( - struct ggml_context * ctx, - const void *opt_extra_addr, - size_t opt_extra_len, - char **err_p) { - // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32 -#if GGML_MLOCK_SUPPORT - if (ctx->mem_buffer_mlocked) { - return true; - } - if (mlock(ctx->mem_buffer, ctx->mem_size) || - (opt_extra_len && - mlock(opt_extra_addr, opt_extra_len))) { - if ((*err_p = malloc(1024))) { - snprintf(*err_p, 1024, - "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION, - ctx->mem_size + opt_extra_len, - strerror(errno)); - } - return false; - } - ctx->mem_buffer_mlocked = true; - return true; -#else // GGML_MLOCK_SUPPORT - *err_p = strdup("can't mlock because it's not supported on this system"); - return false; -#endif // GGML_MLOCK_SUPPORT -} - //////////////////////////////////////////////////////////////////////////////// struct ggml_tensor * ggml_new_tensor_impl( diff --git a/ggml.h b/ggml.h index 2c636c2..af16c64 100644 --- a/ggml.h +++ b/ggml.h @@ -253,6 +253,19 @@ enum ggml_op { GGML_OP_COUNT, }; + +// ggml object +struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + char padding[8]; +}; + +static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -344,13 +357,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx); size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); -bool ggml_mlock_supported(void); -bool ggml_mlock( - struct ggml_context * ctx, - const void *opt_extra_addr, - size_t opt_extra_len, - char **err_p); - struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, diff --git a/llama.cpp b/llama.cpp index fc6f43a..4625e95 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,49 +1,26 @@ +#include "llama_util.h" #include "llama.h" +#include "llama_internal.h" #include "ggml.h" +#include #include #include #include #include #include #include -#include #include #include - -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) -#define WIN32_LEAN_AND_MEAN -#include -#else -#include -#include -#include -#include -#endif - -#define Min(X, Y) ((Y) > (X) ? (X) : (Y)) -#define Max(X, Y) ((Y) < (X) ? (X) : (Y)) +#include +#include +#include +#include #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 -#define LLAMA_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - - -// determine number of model parts based on the dimension -static const std::unordered_map LLAMA_N_PARTS = { - { 4096, 1 }, - { 5120, 2 }, - { 6656, 4 }, - { 8192, 8 }, -}; // available llama models enum e_model { @@ -93,14 +70,18 @@ static const std::map MEM_REQ_EVAL = { // default hparams (LLaMA 7B) struct llama_hparams { - int32_t n_vocab = 32000; - int32_t n_ctx = 512; // this is provided as user input? - int32_t n_embd = 4096; - int32_t n_mult = 256; - int32_t n_head = 32; - int32_t n_layer = 32; - int32_t n_rot = 64; - int32_t f16 = 1; + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 256; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + uint32_t f16 = 1; + + bool operator!=(const llama_hparams & other) const { + return memcmp(this, &other, sizeof(llama_hparams)); + } }; struct llama_layer { @@ -126,11 +107,17 @@ struct llama_kv_cache { struct ggml_tensor * k; struct ggml_tensor * v; - struct ggml_context * ctx; + struct ggml_context * ctx = NULL; - std::vector buf; + llama_buffer buf; int n; // number of tokens currently in the cache + + ~llama_kv_cache() { + if (ctx) { + ggml_free(ctx); + } + } }; struct llama_model { @@ -146,22 +133,30 @@ struct llama_model { std::vector layers; // context - struct ggml_context * ctx; + struct ggml_context * ctx = NULL; // key + value cache for the self attention // TODO: move to llama_state struct llama_kv_cache kv_self; // the model memory buffer - std::vector buf; + llama_buffer buf; // model memory mapped file - void * mm_addr = NULL; - uint64_t mm_length = 0; + std::unique_ptr mapping; - // tensors - int n_loaded; - std::unordered_map tensors; + // objects representing data potentially being locked in memory + llama_mlock mlock_buf; + llama_mlock mlock_mmap; + + // for quantize-stats only + std::vector> tensors_by_name; + + ~llama_model() { + if (ctx) { + ggml_free(ctx); + } + } }; struct llama_vocab { @@ -206,8 +201,8 @@ struct llama_context { // memory buffers used to evaluate the model // TODO: move in llama_state - std::vector buf_compute; - std::vector buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + llama_buffer buf_compute; + llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -220,11 +215,11 @@ struct llama_context { last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); } else { auto & buf = buf_scratch[i]; - last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), }); + last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, }); } if (buf_last >= 0) { - buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size); + buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size); } buf_last = i; @@ -244,6 +239,508 @@ struct llama_context { } }; +template +static T checked_mul(T a, T b) { + T ret = a * b; + if (a != 0 && ret / a != b) { + throw format("overflow multiplying %llu * %llu", + (unsigned long long) a, (unsigned long long) b); + } + return ret; +} + +static size_t checked_div(size_t a, size_t b) { + if (b == 0 || a % b != 0) { + throw format("error dividing %zu / %zu", a, b); + } + return a / b; +} + +static std::string llama_format_tensor_shape(const std::vector & ne) { + std::string ret = "[" + std::to_string(ne.at(0)); + for (size_t i = 1; i < ne.size(); i++) { + ret += " x " + std::to_string(ne.at(i)); + } + ret += "]"; + return ret; +} + +static const char * llama_format_type(enum ggml_type type) { + switch (type) { + case GGML_TYPE_F32: return "f32"; + case GGML_TYPE_F16: return "f16"; + case GGML_TYPE_Q4_0: return "q4_0"; + case GGML_TYPE_Q4_1: return "q4_1"; + default: LLAMA_ASSERT(false); + } +} + +static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml_type type) { + size_t size = ggml_type_size(type); + for (uint32_t dim : ne) { + size = checked_mul(size, dim); + } + return size / ggml_blck_size(type); +} + +struct llama_load_tensor_shard { + std::vector ne; + size_t size; + enum ggml_type type; + size_t file_idx; + size_t file_off; + + void calc_size() { + size = llama_calc_tensor_size(ne, type); + } +}; + +enum llama_split_type { + SPLIT_NONE, + SPLIT_BY_COLUMNS, + SPLIT_BY_ROWS +}; + +struct llama_load_tensor { + std::vector shards; + + std::string name; + enum ggml_type type = GGML_TYPE_F32; + llama_split_type split_type = SPLIT_NONE; + std::vector ne; + size_t size; + struct ggml_tensor * ggml_tensor = NULL; + uint8_t * data; + + llama_load_tensor(const std::string & name) : name(name) {} + + void calc_all() { + calc_type(); + calc_split_type(); + calc_ne(); + calc_size(); + } + + void calc_type() { + const auto & first_shard = shards.at(0); + for (const auto & shard : shards) { + if (shard.type != first_shard.type) { + throw format("inconsistent tensor shard type in '%s'", name.c_str()); + } + } + type = first_shard.type; + } + + void calc_split_type() { + if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file + shards.size() == 1) { // only one file? + split_type = SPLIT_NONE; + } else if (name.find("tok_embeddings.") == 0 || + name.find(".attention.wo.weight") != std::string::npos || + name.find(".feed_forward.w2.weight") != std::string::npos) { + split_type = SPLIT_BY_COLUMNS; + } else { + split_type = SPLIT_BY_ROWS; + } + } + + void calc_ne() { + const auto & first_shard = shards.at(0); + for (const auto & shard : shards) { + if (shard.ne != first_shard.ne) { + throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s", + name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()); + } + } + ne = first_shard.ne; + LLAMA_ASSERT(shards.size() <= UINT32_MAX); + uint32_t n_shards = (uint32_t) shards.size(); + switch (split_type) { + case SPLIT_NONE: + ne = first_shard.ne; + break; + case SPLIT_BY_COLUMNS: + ne = {checked_mul(first_shard.ne[0], n_shards), + first_shard.ne[1]}; + break; + case SPLIT_BY_ROWS: + ne = {first_shard.ne[0], + checked_mul(first_shard.ne[1], n_shards)}; + break; + } + } + + void calc_size() { + size = llama_calc_tensor_size(ne, type); + } +}; + +struct llama_load_tensors_map { + // tensors is kept in a separate vector to preserve file order + std::vector tensors; + std::unordered_map name_to_idx; +}; + +enum llama_file_version { + LLAMA_FILE_VERSION_GGML, + LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab + LLAMA_FILE_VERSION_GGJT_V1, // added padding +}; + +struct llama_file_loader { + llama_file file; + llama_file_version file_version; + llama_hparams hparams; + llama_vocab vocab; + + llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map) + : file(fname, "rb") { + fprintf(stderr, "llama.cpp: loading model from %s\n", fname); + read_magic(); + read_hparams(); + read_vocab(); + read_tensor_metadata(file_idx, tensors_map); + } + void read_magic() { + uint32_t magic = file.read_u32(); + uint32_t version = 0; + + if (magic != 'ggml') { + version = file.read_u32(); + } + + if (magic == 'ggml' && version == 0) { + file_version = LLAMA_FILE_VERSION_GGML; + } else if (magic == 'ggmf' && version == 1) { + file_version = LLAMA_FILE_VERSION_GGMF_V1; + } else if (magic == 'ggjt' && version == 1) { + file_version = LLAMA_FILE_VERSION_GGJT_V1; + } else { + throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", + magic, version); + } + } + void read_hparams() { + hparams.n_vocab = file.read_u32(); + hparams.n_embd = file.read_u32(); + hparams.n_mult = file.read_u32(); + hparams.n_head = file.read_u32(); + hparams.n_layer = file.read_u32(); + hparams.n_rot = file.read_u32(); + hparams.f16 = file.read_u32(); + } + void read_vocab() { + vocab.id_to_token.resize(hparams.n_vocab); + + for (uint32_t i = 0; i < hparams.n_vocab; i++) { + uint32_t len = file.read_u32(); + std::string word = file.read_string(len); + + float score = 0.0f; + if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) { + file.read_raw(&score, sizeof(score)); + } + + vocab.token_to_id[word] = i; + + auto & tok_score = vocab.id_to_token[i]; + tok_score.tok = std::move(word); + tok_score.score = score; + } + } + void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) { + while (file.tell() < file.size) { + llama_load_tensor_shard shard; + uint32_t n_dims = file.read_u32(); + uint32_t name_len = file.read_u32(); + uint32_t ftype = file.read_u32(); + shard.ne.resize(n_dims); + file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); + std::string name = file.read_string(name_len); + if (n_dims < 1 || n_dims > 2) { + throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); + } + switch (ftype) { + case 0: shard.type = GGML_TYPE_F32; break; + case 1: shard.type = GGML_TYPE_F16; break; + case 2: shard.type = GGML_TYPE_Q4_0; break; + case 3: shard.type = GGML_TYPE_Q4_1; break; + default: { + throw format("unrecognized ftype %u\n", ftype); + } + } + + if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { + // skip to the next multiple of 32 bytes + file.seek(-file.tell() & 31, SEEK_CUR); + } + shard.file_idx = file_idx; + shard.file_off = file.tell(); + + shard.calc_size(); + file.seek(shard.size, SEEK_CUR); + + auto it = tensors_map.name_to_idx.find(name); + size_t idx; + if (it != tensors_map.name_to_idx.end()) { + idx = it->second; + } else { + tensors_map.tensors.emplace_back(name); + idx = tensors_map.tensors.size() - 1; + tensors_map.name_to_idx.emplace(name, idx); + } + tensors_map.tensors.at(idx).shards.push_back(shard); + } + } +}; + +struct llama_file_saver { + llama_file file; + llama_file_loader * any_file_loader; + llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16) + : file(fname, "wb"), any_file_loader(any_file_loader) { + fprintf(stderr, "llama.cpp: saving model to %s\n", fname); + write_magic(); + write_hparams(new_f16); + write_vocab(); + } + void write_magic() { + file.write_u32('ggjt'); // magic + file.write_u32(1); // version + } + void write_hparams(uint32_t new_f16) { + const llama_hparams & hparams = any_file_loader->hparams; + file.write_u32(hparams.n_vocab); + file.write_u32(hparams.n_embd); + file.write_u32(hparams.n_mult); + file.write_u32(hparams.n_head); + file.write_u32(hparams.n_layer); + file.write_u32(hparams.n_rot); + file.write_u32(new_f16); + } + void write_vocab() { + if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { + fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); + } + uint32_t n_vocab = any_file_loader->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = any_file_loader->vocab.id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + } + void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { + uint32_t ftype; + switch (new_type) { + case GGML_TYPE_F32: ftype = 0; break; + case GGML_TYPE_F16: ftype = 1; break; + case GGML_TYPE_Q4_0: ftype = 2; break; + case GGML_TYPE_Q4_1: ftype = 3; break; + default: LLAMA_ASSERT(false); + } + file.write_u32((uint32_t) tensor.ne.size()); + file.write_u32((uint32_t) tensor.name.size()); + file.write_u32(ftype); + file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); + file.write_raw(tensor.name.data(), tensor.name.size()); + file.seek(-file.tell() & 31, SEEK_CUR); + LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); + file.write_raw(new_data, new_size); + } +}; + +struct llama_model_loader { + std::vector> file_loaders; + llama_load_tensors_map tensors_map; + bool use_mmap; + size_t num_ggml_tensors_created = 0; + struct ggml_context * ggml_ctx = NULL; + std::unique_ptr mapping; + + llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { + auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); + file_loaders.emplace_back(first_file); + uint32_t n_parts = vocab_only ? 1 : guess_n_parts(); + for (uint32_t i = 1; i < n_parts; i++) { + std::string fname = fname_base + "." + std::to_string(i); + auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map); + file_loaders.emplace_back(ith_file); + if (ith_file->hparams != first_file->hparams) { + throw format("llama.cpp: hparams inconsistent between files"); + } + } + if (!llama_mmap::SUPPORTED) { + use_mmap = false; + } + if (use_mmap && alignment_prevents_mmap()) { + fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n"); + use_mmap = false; + } + this->use_mmap = use_mmap; + for (llama_load_tensor & lt : tensors_map.tensors) { + lt.calc_all(); + } + } + + bool alignment_prevents_mmap() { + for (const llama_load_tensor & lt : tensors_map.tensors) { + for (const llama_load_tensor_shard & shard : lt.shards) { + if (shard.file_off & 3) { + return true; + } + } + } + return false; + } + + uint32_t guess_n_parts() const { + auto it = tensors_map.name_to_idx.find("tok_embeddings.weight"); + if (it == tensors_map.name_to_idx.end()) { + throw std::string("missing tok_embeddings.weight"); + } + const llama_load_tensor & lt = tensors_map.tensors.at(it->second); + return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0); + } + + void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { + *ctx_size_p = *mmapped_size_p = 0; + for (const llama_load_tensor & lt : tensors_map.tensors) { + *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size; + } + } + + struct ggml_tensor * get_tensor(const std::string & name, std::vector ne) { + auto it = tensors_map.name_to_idx.find(name); + if (it == tensors_map.name_to_idx.end()) { + throw format("llama.cpp: tensor '%s' is missing from model", name.c_str()); + } + llama_load_tensor & lt = tensors_map.tensors.at(it->second); + if (lt.ne != ne) { + throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", + name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); + } + return get_tensor_for(lt); + } + + struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) { + struct ggml_tensor * tensor; + if (lt.ne.size() == 2) { + tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); + } else { + LLAMA_ASSERT(lt.ne.size() == 1); + tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0)); + } + LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor + lt.ggml_tensor = tensor; + num_ggml_tensors_created++; + return tensor; + } + + void done_getting_tensors() { + if (num_ggml_tensors_created != tensors_map.tensors.size()) { + throw std::string("llama.cpp: file contained more tensors than expected"); + } + } + + void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + size_t data_size = 0; + for (const llama_load_tensor & lt : tensors_map.tensors) { + data_size += lt.size; + } + + if (use_mmap) { + mapping.reset(new llama_mmap(&file_loaders.at(0)->file)); + if (!lmlock) { + // Don't call the callback since the actual loading will be lazy + // and we can't measure it. + progress_callback = NULL; + } + if (lmlock) { + lmlock->init(mapping->addr); + } + } + + size_t done_size = 0; + for (llama_load_tensor & lt : tensors_map.tensors) { + if (progress_callback) { + progress_callback((float) done_size / data_size, progress_callback_user_data); + } + LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already + lt.data = (uint8_t *) lt.ggml_tensor->data; + load_data_for(lt); + lt.ggml_tensor->data = lt.data; + done_size += lt.size; + if (use_mmap && lmlock) { + lmlock->grow_to(done_size); + } + } + if (progress_callback) { + progress_callback(1.0f, progress_callback_user_data); + } + } + + void load_data_for(llama_load_tensor & lt) { + if (use_mmap) { + LLAMA_ASSERT(lt.shards.size() == 1); + lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off; + } else if (lt.split_type == SPLIT_NONE) { + llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file; + file.seek(lt.shards.at(0).file_off, SEEK_SET); + file.read_raw(lt.data, lt.size); + } else if (lt.split_type == SPLIT_BY_ROWS) { + size_t offset = 0; + for (llama_load_tensor_shard & shard : lt.shards) { + llama_file & file = file_loaders.at(shard.file_idx)->file; + file.seek(shard.file_off, SEEK_SET); + file.read_raw(lt.data + offset, shard.size); + offset += shard.size; + } + LLAMA_ASSERT(offset == lt.size); + } else if (lt.split_type == SPLIT_BY_COLUMNS) { + // Let's load the data into temporary buffers to ensure the OS performs large loads. + std::vector tmp_bufs; + tmp_bufs.resize(lt.shards.size()); + for (size_t i = 0; i < lt.shards.size(); i++) { + llama_load_tensor_shard & shard = lt.shards.at(i); + llama_file & file = file_loaders.at(shard.file_idx)->file; + file.seek(shard.file_off, SEEK_SET); + tmp_bufs.at(i).resize(shard.size); + file.read_raw(tmp_bufs.at(i).addr, shard.size); + } + // Then reshape. + size_t num_rows = lt.ne.at(1); + size_t per_shard_row_size = lt.shards.at(0).size / num_rows; + size_t out_offset = 0; + for (size_t row = 0; row < num_rows; row++) { + for (llama_buffer & tmp_buf : tmp_bufs) { + memcpy(lt.data + out_offset, + tmp_buf.addr + row * per_shard_row_size, + per_shard_row_size); + out_offset += per_shard_row_size; + } + } + LLAMA_ASSERT(out_offset == lt.size); + } + if (0) { + print_checksum(lt); + } + } + + static void print_checksum(llama_load_tensor & lt) { + uint32_t sum = 0; + for (size_t i = 0; i < lt.size; i++) { + uint8_t byte = lt.data[i]; + sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash + } + fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, + llama_format_tensor_shape(lt.ne).c_str(), lt.size); + } + +}; + + // // kv cache // @@ -262,8 +759,8 @@ static bool kv_cache_init( cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); struct ggml_init_params params; - params.mem_size = cache.buf.size(); - params.mem_buffer = cache.buf.data(); + params.mem_size = cache.buf.size; + params.mem_buffer = cache.buf.addr; params.no_alloc = false; cache.ctx = ggml_init(params); @@ -279,13 +776,6 @@ static bool kv_cache_init( return true; } -static void kv_cache_free(struct llama_kv_cache & cache) { - if (cache.ctx) { - ggml_free(cache.ctx); - cache.ctx = nullptr; - } -} - struct llama_context_params llama_context_default_params() { struct llama_context_params result = { /*.n_ctx =*/ 512, @@ -294,6 +784,7 @@ struct llama_context_params llama_context_default_params() { /*.f16_kv =*/ false, /*.logits_all =*/ false, /*.vocab_only =*/ false, + /*.use_mmap =*/ true, /*.use_mlock =*/ false, /*.embedding =*/ false, /*.progress_callback =*/ nullptr, @@ -303,243 +794,71 @@ struct llama_context_params llama_context_default_params() { return result; } -// -// model loading -// - -static void *mmap_file(const char *fname, uint64_t *mm_length) { -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) - HANDLE hFile = CreateFileA(fname, - GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, - OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED, - NULL); - if (hFile == INVALID_HANDLE_VALUE) return 0; - LARGE_INTEGER fileSize; - fileSize.QuadPart = -1; - GetFileSizeEx(hFile, &fileSize); - int64_t length = fileSize.QuadPart; - HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); - CloseHandle(hFile); - if (!hMapping) return 0; - void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); - CloseHandle(hMapping); - if (!addr) return 0; -#else - int fd = open(fname, O_RDONLY); - if (fd == -1) return 0; - int64_t length = lseek(fd, 0, SEEK_END); - void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); - close(fd); - if (addr == MAP_FAILED) return 0; -#endif - *mm_length = length; - return addr; +bool llama_mmap_supported() { + return llama_mmap::SUPPORTED; } -static void munmap_file(void * addr, size_t length) { -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) - UnmapViewOfFile(addr); -#else - munmap(addr, length); -#endif +bool llama_mlock_supported() { + return llama_mlock::SUPPORTED; } -static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) { - fprintf(stderr, - "%s: invalid model file (bad magic [got %#x want %#x])\n" - "\tyou most likely need to regenerate your ggml files\n" - "\tthe benefit is you'll get 10-100x faster load times\n" - "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n" - "\tuse convert-pth-to-ggml.py to regenerate from original pth\n" - "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n", - path, got, want); - return false; -} +// +// model loading +// -static bool llama_model_load( +static void llama_model_load_internal( const std::string & fname, llama_context & lctx, int n_ctx, - int n_parts, ggml_type memory_type, + bool use_mmap, + bool use_mlock, bool vocab_only, llama_progress_callback progress_callback, - void *progress_callback_user_data) { - fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); + void * progress_callback_user_data) { lctx.t_start_us = ggml_time_us(); - auto & model = lctx.model; - auto & vocab = lctx.vocab; + std::unique_ptr ml(new llama_model_loader(fname, use_mmap, vocab_only)); - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - std::vector f_buf(1024*1024); - fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); - - fin.seekg(0, fin.end); - const size_t file_size = fin.tellg(); - fin.seekg(0); + lctx.vocab = std::move(ml->file_loaders.at(0)->vocab); + auto & model = lctx.model; + auto & hparams = model.hparams; + hparams = ml->file_loaders.at(0)->hparams; + uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - // verify magic { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) { - fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n", - __func__, fname.c_str()); - return false; - } - if (magic != LLAMA_FILE_MAGIC) { - return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 60: model.type = e_model::MODEL_30B; break; + case 80: model.type = e_model::MODEL_65B; break; } - uint32_t format_version; - fin.read((char *) &format_version, sizeof(format_version)); - - if (format_version != LLAMA_FILE_VERSION) { - fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n", - __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION); - return false; - } - } - - int n_ff = 0; - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fin.read((char *) &hparams.f16, sizeof(hparams.f16)); - hparams.n_ctx = n_ctx; - n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - - if (n_parts < 1) { - n_parts = LLAMA_N_PARTS.at(hparams.n_embd); - } - - // temp warning to tell the user to use "--n_parts" - if (hparams.f16 == 4 && n_parts != 1) { - fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts); - fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__); - } - - if (hparams.n_layer == 32) { - model.type = e_model::MODEL_7B; - } - - if (hparams.n_layer == 40) { - model.type = e_model::MODEL_13B; - } - - if (hparams.n_layer == 60) { - model.type = e_model::MODEL_30B; - } - - if (hparams.n_layer == 80) { - model.type = e_model::MODEL_65B; - } - - fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); - fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); - fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); - fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); - fprintf(stderr, "%s: type = %d\n", __func__, model.type); - } - - // load vocab - { - std::string word; - vocab.id_to_token.resize(model.hparams.n_vocab); - std::vector tmp(64); - - for (int i = 0; i < model.hparams.n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - word.resize(len); - if (len > 0) { - tmp.resize(len); - fin.read(tmp.data(), len); - word.assign(tmp.data(), len); - } else { - word.clear(); - } - - float score; - fin.read((char *) &score, sizeof(score)); - - vocab.token_to_id[word] = i; - - auto &tok_score = vocab.id_to_token[i]; - tok_score.tok = word; - tok_score.score = score; - } + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: f16 = %u\n", __func__, hparams.f16); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); + fprintf(stderr, "%s: type = %u\n", __func__, model.type); } if (vocab_only) { - return true; - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - // wtype is for per-layer weights, while vtype is for other weights - ggml_type wtype, vtype; - switch (model.hparams.f16) { - case 0: wtype = vtype = GGML_TYPE_F32; break; - case 1: wtype = vtype = GGML_TYPE_F16; break; - case 2: wtype = vtype = GGML_TYPE_Q4_0; break; - case 3: wtype = vtype = GGML_TYPE_Q4_1; break; - case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break; - default: - { - fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", - __func__, fname.c_str(), model.hparams.f16); - return false; - } - } - - // map model into memory - char *mm_addr = NULL; - model.mm_addr = mmap_file(fname.c_str(), &model.mm_length); - if (model.mm_addr == NULL) { - fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str()); - return false; + return; } - mm_addr = (char *)model.mm_addr; - fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0)); auto & ctx = model.ctx; - size_t ctx_size = 0; - { - const auto &hparams = model.hparams; - const int n_layer = hparams.n_layer; - ctx_size += (5 + 10*n_layer)*256; // object overhead - fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); - } + size_t ctx_size, mmapped_size; + ml->calc_sizes(&ctx_size, &mmapped_size); + fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); // print memory requirements { @@ -548,7 +867,7 @@ static bool llama_model_load( // this is the total memory required to run the inference const size_t mem_required = ctx_size + - model.mm_length + + mmapped_size + MEM_REQ_SCRATCH0.at(model.type) + MEM_REQ_SCRATCH1.at(model.type) + MEM_REQ_EVAL.at (model.type); @@ -564,17 +883,20 @@ static bool llama_model_load( // create the ggml context { lctx.model.buf.resize(ctx_size); + if (use_mlock) { + lctx.model.mlock_buf.init(lctx.model.buf.addr); + lctx.model.mlock_buf.grow_to(lctx.model.buf.size); + } struct ggml_init_params params = { - /*.mem_size =*/ lctx.model.buf.size(), - /*.mem_buffer =*/ lctx.model.buf.data(), - /*.no_alloc =*/ true, + /*.mem_size =*/ lctx.model.buf.size, + /*.mem_buffer =*/ lctx.model.buf.addr, + /*.no_alloc =*/ ml->use_mmap, }; model.ctx = ggml_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; + throw format("ggml_init() failed"); } } @@ -582,161 +904,71 @@ static bool llama_model_load( { const auto & hparams = model.hparams; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab); - - model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab); + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; - // map by name - model.tensors["tok_embeddings.weight"] = model.tok_embeddings; + ml->ggml_ctx = ctx; - model.tensors["norm.weight"] = model.norm; - model.tensors["output.weight"] = model.output; + model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); + model.norm = ml->get_tensor("norm.weight", {n_embd}); + model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); - for (int i = 0; i < n_layer; ++i) { + model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + std::string layers_i = "layers." + std::to_string(i); - layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}); - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}); - layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); - layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd); - layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); + layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}); - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq; - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk; - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv; - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo; - - model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3; + layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); + layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); + layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); } } - std::vector tmp; + ml->done_getting_tensors(); - if (progress_callback) { - progress_callback(0.0, progress_callback_user_data); + // populate `tensors_by_name` + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); } - fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str()); - - // load weights - { - size_t total_size = 0; - model.n_loaded = 0; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ftype), sizeof(ftype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name.data()) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); - return false; - } - - auto tensor = model.tensors[name.data()]; - - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - return false; - } - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n", - __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); - return false; - } - if (0) { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; - fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]); - } - - switch (ftype) { - case 0: // f32 - case 1: // f16 - break; - case 2: // q4_0 - case 3: // q4_1 - assert(ne[0] % 64 == 0); - break; - default: - fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); - return false; - }; - - // load the tensor data into memory without copying or reading it - size_t offset = fin.tellg(); - size_t tensor_data_size = ggml_nbytes(tensor); - offset = (offset + 31) & -32; - tensor->data = mm_addr + offset; - fin.seekg(offset + tensor_data_size); - total_size += tensor_data_size; - model.n_loaded++; - - // progress - if (progress_callback) { - double current_progress = size_t(fin.tellg()) / double(file_size); - progress_callback(current_progress, progress_callback_user_data); - } - } - - fin.close(); - - fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded); - if (model.n_loaded == 0) { - fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); - } else if (model.n_loaded != (int) model.tensors.size()) { - fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded); - return false; - } - } + model.mapping = std::move(ml->mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration lctx.t_load_us = ggml_time_us() - lctx.t_start_us; +} - if (progress_callback) { - progress_callback(1.0, progress_callback_user_data); +static bool llama_model_load( + const std::string & fname, + llama_context & lctx, + int n_ctx, + ggml_type memory_type, + bool use_mmap, + bool use_mlock, + bool vocab_only, + llama_progress_callback progress_callback, + void *progress_callback_user_data) { + try { + llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock, + vocab_only, progress_callback, progress_callback_user_data); + return true; + } catch (const std::string & err) { + fprintf(stderr, "error loading model: %s\n", err.c_str()); + return false; } - - return true; } // evaluate the transformer @@ -774,8 +1006,8 @@ static bool llama_eval_internal( auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size(), - /*.mem_buffer =*/ buf_compute.data(), + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, /*.no_alloc =*/ false, }; @@ -1061,7 +1293,7 @@ struct llama_tokenizer { size_t offs = 0; while (offs < text.size()) { llama_sp_symbol sym; - size_t char_len = Min(text.size() - offs, utf8_len(text[offs])); + size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); sym.text = text.c_str() + offs; sym.n = char_len; offs += char_len; @@ -1236,7 +1468,7 @@ static llama_vocab::id llama_sample_top_p_top_k( } } - sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits); + sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits); // compute probs for the top k tokens std::vector probs; @@ -1284,298 +1516,118 @@ static llama_vocab::id llama_sample_top_p_top_k( // quantization // -// TODO: reuse code from the llama_model_load() somehow -static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { - ggml_type type = GGML_TYPE_Q4_1; - +static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { + ggml_type quantized_type; switch (itype) { - case 2: type = GGML_TYPE_Q4_0; break; - case 3: type = GGML_TYPE_Q4_1; break; - default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1; + case 2: quantized_type = GGML_TYPE_Q4_0; break; + case 3: quantized_type = GGML_TYPE_Q4_1; break; + default: throw format("invalid quantization type %d\n", itype); }; - if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) { - fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type); - return false; - } - - llama_vocab vocab; - - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); - - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); - return false; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - finp.read((char *) &magic, sizeof(magic)); - if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) { - fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n", - __func__, fname_inp.c_str()); - return false; - } - if (magic != LLAMA_FILE_MAGIC) { - return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC); - } - - fout.write((char *) &magic, sizeof(magic)); - - uint32_t format_version; - finp.read((char *) &format_version, sizeof(format_version)); - - if (format_version != LLAMA_FILE_VERSION) { - fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n", - __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION); - return false; - } - - fout.write((char *) &format_version, sizeof(format_version)); - } - - llama_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - finp.read((char *) &hparams.f16, sizeof(hparams.f16)); - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_mult = %d\n", __func__, hparams.n_mult); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: f16 = %d\n", __func__, hparams.f16); - - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); - fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fout.write((char *) &itype, sizeof(hparams.f16)); - } - - // load vocab - { - const int32_t n_vocab = hparams.n_vocab; - - if (n_vocab != hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); - return false; - } - - std::vector word(32); - vocab.id_to_token.resize(n_vocab); - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read ((char *) &len, sizeof(len)); - fout.write((char *) &len, sizeof(len)); - - word.resize(len); - finp.read ((char *) &word[0], len); - fout.write((char *) &word[0], len); - - float score; - finp.read ((char *) &score, sizeof(score)); - fout.write((char *) &score, sizeof(score)); - - vocab.token_to_id[word.data()] = i; - - auto &tok_score = vocab.id_to_token[i]; - tok_score.tok = word.data(); - tok_score.score = score; - } - } - - // load weights - { - size_t total_size_org = 0; - size_t total_size_new = 0; - - std::vector work; - - std::vector data_u8; - std::vector data_f16; - std::vector data_f32; - - std::vector hist_all(1 << 4, 0); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - finp.read(reinterpret_cast(&length), sizeof(length)); - finp.read(reinterpret_cast(&ftype), sizeof(ftype)); - - if (finp.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - finp.read (&name[0], length); - - { - // ensure tensor data is aligned - uint64_t offset = finp.tellg(); - offset = (offset + 31) & -32; - finp.seekg(offset); - } - - { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; - printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]); - } - - // regexes of tensor names to be quantized - const std::vector k_names = { - ".*weight", - }; - - bool quantize = false; - for (const auto & s : k_names) { - if (std::regex_match(name, std::regex(s))) { - quantize = true; - break; - } - } - - // quantize only 2D tensors - quantize &= (n_dims == 2); - - if (quantize) { - if (ftype != 0 && ftype != 1) { - fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype); - return false; - } - - if (ftype == 1) { - data_f16.resize(nelements); - finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); - data_f32.resize(nelements); - for (int i = 0; i < nelements; ++i) { - data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); - } - } else { - data_f32.resize(nelements); - finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); + std::unique_ptr model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, + /*vocab_only*/ false)); + llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype); + + size_t total_size_org = 0; + size_t total_size_new = 0; + std::vector hist_all(1 << 4, 0); + + size_t idx = 0; + for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { + llama_buffer read_data; + read_data.resize(tensor.size); + tensor.data = read_data.addr; + model_loader->load_data_for(tensor); + + printf("[%zu/%zu] %36s - %s, type = %6s, ", + ++idx, model_loader->tensors_map.tensors.size(), + tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), + llama_format_type(tensor.type)); + + // This used to be a regex, but has an extreme cost to compile times. + bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? + + // quantize only 2D tensors + quantize &= (tensor.ne.size() == 2); + + enum ggml_type new_type; + void * new_data; + size_t new_size; + llama_buffer work; + + if (!quantize) { + new_type = tensor.type; + new_data = tensor.data; + new_size = tensor.size; + printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0); + } else { + new_type = quantized_type; + float * f32_data; + size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); + llama_buffer f32_conv_buf; + if (tensor.type == GGML_TYPE_F32) { + f32_data = (float *) tensor.data; + } else if (tensor.type == GGML_TYPE_F16) { + f32_conv_buf.resize(nelements * sizeof(float)); + f32_data = (float *) f32_conv_buf.addr; + auto f16_data = (const ggml_fp16_t *) tensor.data; + for (size_t i = 0; i < nelements; i++) { + f32_data[i] = ggml_fp16_to_fp32(f16_data[i]); } - - ftype = itype; } else { - const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t); - - data_u8.resize(nelements*bpe); - finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); + throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type)); } - fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); - fout.write(reinterpret_cast(&length), sizeof(length)); - fout.write(reinterpret_cast(&ftype), sizeof(ftype)); - for (int i = 0; i < n_dims; ++i) { - fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); + printf("quantizing .. "); + fflush(stdout); + + work.resize(nelements * 4); // upper bound on size + new_data = work.addr; + std::vector hist_cur(1 << 4, 0); + + switch (new_type) { + case GGML_TYPE_Q4_0: + { + new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: + { + new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; + default: + LLAMA_ASSERT(false); } - fout.write(&name[0], length); - { - // ensure tensor data is aligned - uint64_t offset = fout.tellp(); - offset = (offset + 31) & -32; - fout.seekp(offset); + printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); + for (size_t i = 0; i < hist_cur.size(); i++) { + hist_all[i] += hist_cur[i]; } - if (quantize) { - printf("quantizing .. "); - work.resize(nelements); // for quantization - - size_t cur_size = 0; - std::vector hist_cur(1 << 4, 0); - - switch (type) { - case GGML_TYPE_Q4_0: - { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: - { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - default: - { - fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type); - return false; - } - } - - fout.write(reinterpret_cast(work.data()), cur_size); - total_size_new += cur_size; - - printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < (int) hist_cur.size(); ++i) { - hist_all[i] += hist_cur[i]; - } - - for (int i = 0; i < (int) hist_cur.size(); ++i) { - printf("%5.3f ", hist_cur[i] / float(nelements)); - } - printf("\n"); - } else { - printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); - fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); - total_size_new += data_u8.size(); + for (size_t i = 0; i < hist_cur.size(); i++) { + printf("%5.3f ", hist_cur[i] / float(nelements)); } - - total_size_org += nelements * sizeof(float); + printf("\n"); } + total_size_org += tensor.size; + total_size_new += new_size; + file_saver.write_tensor(tensor, new_type, new_data, new_size); + } - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); + printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); - { - int64_t sum_all = 0; - for (int i = 0; i < (int) hist_all.size(); ++i) { - sum_all += hist_all[i]; - } + { + int64_t sum_all = 0; + for (size_t i = 0; i < hist_all.size(); i++) { + sum_all += hist_all[i]; + } - printf("%s: hist: ", __func__); - for (int i = 0; i < (int) hist_all.size(); ++i) { - printf("%5.3f ", hist_all[i] / float(sum_all)); - } - printf("\n"); + printf("%s: hist: ", __func__); + for (size_t i = 0; i < hist_all.size(); i++) { + printf("%5.3f ", hist_all[i] / float(sum_all)); } + printf("\n"); } - - finp.close(); - fout.close(); - - return true; } // @@ -1593,32 +1645,36 @@ struct llama_context * llama_init_from_file( params.seed = time(NULL); } + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + ++*cur_percentage_p; + fprintf(stderr, "."); + fflush(stderr); + if (percentage >= 100) { + fprintf(stderr, "\n"); + } + } + }; + } + ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type, - params.vocab_only, params.progress_callback, - params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type, + params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data)) { fprintf(stderr, "%s: failed to load model\n", __func__); llama_free(ctx); return nullptr; } - if (params.use_mlock) { - char *err; - if (!ggml_mlock(ctx->model.ctx, - ctx->model.mm_addr, - ctx->model.mm_length, - &err)) { - fprintf(stderr, "%s\n", err); - free(err); - llama_free(ctx); - return nullptr; - } - } - // reserve memory for context buffers if (!params.vocab_only) { if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { @@ -1655,16 +1711,6 @@ struct llama_context * llama_init_from_file( } void llama_free(struct llama_context * ctx) { - kv_cache_free(ctx->model.kv_self); - - if (ctx->model.ctx) { - ggml_free(ctx->model.ctx); - } - - if (ctx->model.mm_addr) { - munmap_file(ctx->model.mm_addr, ctx->model.mm_length); - } - delete ctx; } @@ -1672,23 +1718,24 @@ int llama_model_quantize( const char * fname_inp, const char * fname_out, int itype) { - if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) { - fprintf(stderr, "%s: failed to quantize\n", __func__); + try { + llama_model_quantize_internal(fname_inp, fname_out, itype); + return 0; + } catch (const std::string & err) { + fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); return 1; } - - return 0; } // Returns the KV cache that will contain the context for the // ongoing prediction with the model. const uint8_t * llama_get_kv_cache(struct llama_context * ctx) { - return ctx->model.kv_self.buf.data(); + return ctx->model.kv_self.buf.addr; } // Returns the size of the KV cache size_t llama_get_kv_cache_size(struct llama_context * ctx) { - return ctx->model.kv_self.buf.size(); + return ctx->model.kv_self.buf.size; } int llama_get_kv_cache_token_count(struct llama_context * ctx) { @@ -1702,8 +1749,8 @@ void llama_set_kv_cache( size_t n_size, int n_token_count) { // Make sure we have the same kv cache setup - LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size); - memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size); + LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size); + memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size); ctx->model.kv_self.n = n_token_count; } @@ -1814,9 +1861,9 @@ llama_token llama_sample_top_p_top_k( void llama_print_timings(struct llama_context * ctx) { const int64_t t_end_us = ggml_time_us(); - const int32_t n_sample = Max(1, ctx->n_sample); - const int32_t n_eval = Max(1, ctx->n_eval); - const int32_t n_p_eval = Max(1, ctx->n_p_eval); + const int32_t n_sample = std::max(1, ctx->n_sample); + const int32_t n_eval = std::max(1, ctx->n_eval); + const int32_t n_p_eval = std::max(1, ctx->n_p_eval); fprintf(stderr, "\n"); fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); @@ -1854,6 +1901,6 @@ const char * llama_print_system_info(void) { } // For internal test use -std::unordered_map& llama_internal_get_tensor_map(struct llama_context * ctx) { - return ctx->model.tensors; +std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { + return ctx->model.tensors_by_name; } diff --git a/llama.h b/llama.h index deb09fe..42c364c 100644 --- a/llama.h +++ b/llama.h @@ -55,6 +55,7 @@ extern "C" { bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights + bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only @@ -66,6 +67,9 @@ extern "C" { LLAMA_API struct llama_context_params llama_context_default_params(); + LLAMA_API bool llama_mmap_supported(); + LLAMA_API bool llama_mlock_supported(); + // Various functions for loading a ggml llama model. // Allocate (almost) all memory needed for the model. // Return NULL on failure @@ -164,13 +168,6 @@ extern "C" { #ifdef __cplusplus } - -#include -#include -// -// Internal function exposed for tests and benchmarks -// -std::unordered_map& llama_internal_get_tensor_map(struct llama_context * ctx); #endif -#endif +#endif // LLAMA_H diff --git a/llama_internal.h b/llama_internal.h new file mode 100644 index 0000000..543eed9 --- /dev/null +++ b/llama_internal.h @@ -0,0 +1,12 @@ +// Internal header to be included by llama.cpp and tests/benchmarks only. + +#ifndef LLAMA_INTERNAL_H +#define LLAMA_INTERNAL_H + +#include +#include +struct ggml_tensor; + +std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); + +#endif // LLAMA_INTERNAL_H diff --git a/llama_util.h b/llama_util.h new file mode 100755 index 0000000..d68f49b --- /dev/null +++ b/llama_util.h @@ -0,0 +1,383 @@ +// Internal header to be included only by llama.cpp. +// Contains wrappers around OS interfaces. + +#ifndef LLAMA_UTIL_H +#define LLAMA_UTIL_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #define NOMINMAX + #include + #include + #include // for _fseeki64 +#endif + +#define LLAMA_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif +static std::string format(const char * fmt, ...) { + va_list ap, ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + LLAMA_ASSERT(size >= 0 && size < INT_MAX); + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + LLAMA_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +}; + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw format("failed to open %s: %s", fname, std::strerror(errno)); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + LLAMA_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + LLAMA_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + throw format("read error: %s", strerror(errno)); + } + if (ret != 1) { + throw std::string("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + throw format("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +#if defined(_WIN32) +static std::string llama_format_win_err(DWORD err) { + LPSTR buf; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); + if (!size) { + return "FormatMessageA failed"; + } + std::string ret(buf, size); + LocalFree(buf); + return ret; +} +#endif + +struct llama_mmap { + void * addr; + size_t size; + + llama_mmap(const llama_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + llama_mmap(struct llama_file * file) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; +#ifdef __linux__ + flags |= MAP_POPULATE; +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + throw format("mmap failed: %s", strerror(errno)); + } + + // Advise the kernel to preload the mapped memory + if (madvise(addr, file->size, MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + + ~llama_mmap() { + munmap(addr, size); + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + llama_mmap(struct llama_file * file) { + size = file->size; + + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); + + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + DWORD error = GetLastError(); + CloseHandle(hFile); + + if (hMapping == NULL) { + throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()); + } + + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + error = GetLastError(); + CloseHandle(hMapping); + + if (addr == NULL) { + throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()); + } + + // Advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } + + ~llama_mmap() { + if (!UnmapViewOfFile(addr)) { + fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + llama_mmap(struct llama_file *) { + throw std::string("mmap not supported"); + } +#endif +}; + +// Represents some region of memory being locked using mlock or VirtualLock; +// will automatically unlock on destruction. +struct llama_mlock { + void * addr = NULL; + size_t size = 0; + bool failed_already = false; + + llama_mlock() {} + llama_mlock(const llama_mlock &) = delete; + + ~llama_mlock() { + if (size) { + raw_unlock(addr, size); + } + } + + void init(void * addr) { + LLAMA_ASSERT(this->addr == NULL && this->size == 0); + this->addr = addr; + } + + void grow_to(size_t target_size) { + LLAMA_ASSERT(addr); + if (failed_already) { + return; + } + size_t granularity = lock_granularity(); + target_size = (target_size + granularity - 1) & ~(granularity - 1); + if (target_size > size) { + if (raw_lock((uint8_t *) addr + size, target_size - size)) { + size = target_size; + } else { + failed_already = true; + } + } + } + +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + return (size_t) sysconf(_SC_PAGESIZE); + } + + #ifdef __APPLE__ + #define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" + #else + #define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" + #endif + + bool raw_lock(const void * addr, size_t size) { + if (!mlock(addr, size)) { + return true; + } else { + fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION, + size, this->size, std::strerror(errno)); + return false; + } + } + + #undef MLOCK_SUGGESTION + + void raw_unlock(void * addr, size_t size) { + if (munlock(addr, size)) { + fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); + } + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return (size_t) si.dwPageSize; + } + + bool raw_lock(void * addr, size_t size) { + for (int tries = 1; ; tries++) { + if (VirtualLock(addr, size)) { + return true; + } + if (tries == 2) { + fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + size, this->size, llama_format_win_err(GetLastError()).c_str()); + return false; + } + + // It failed but this was only the first try; increase the working + // set size and try again. + SIZE_T min_ws_size, max_ws_size; + if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { + fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + // Per MSDN: "The maximum number of pages that a process can lock + // is equal to the number of pages in its minimum working set minus + // a small overhead." + // Hopefully a megabyte is enough overhead: + size_t increment = size + 1048576; + // The minimum must be <= the maximum, so we need to increase both: + min_ws_size += size; + max_ws_size += size; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { + fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + } + } + + void raw_unlock(void * addr, size_t size) { + if (!VirtualUnlock(addr, size)) { + fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + void raw_lock(const void * addr, size_t size) { + fprintf(stderr, "warning: mlock not supported on this system\n"); + } + + void raw_unlock(const void * addr, size_t size) {} +#endif +}; + +// Replacement for std::vector that doesn't require zero-initialization. +struct llama_buffer { + uint8_t * addr = NULL; + size_t size = 0; + + void resize(size_t size) { + delete[] addr; + addr = new uint8_t[size]; + this->size = size; + } + + ~llama_buffer() { + delete[] addr; + } +}; +#endif