From 353ec251a42491f5192c48561da4b444ef67f23c Mon Sep 17 00:00:00 2001
From: "Fabio R. Sluzala"
Date: Tue, 21 Mar 2023 14:21:50 -0300
Subject: [PATCH] We could use std::unordered_map over std::map (#305)

* Improve performance by changing std::map to std::unordered_map and
  std::map<id, token> id_to_token; to std::vector<token> id_to_token;

* fix last commit on gpt_vocab_init add
  vocab.id_to_token.resize(vocab.token_to_id.size());

* Removed include <map>

* Nest struct token score inside gpt_vocab

* renamed token to tok
---
 main.cpp     | 18 ++++++++++--------
 quantize.cpp |  8 +++++---
 utils.cpp    | 20 ++++++++++++--------
 utils.h      | 14 +++++++++-----
 4 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/main.cpp b/main.cpp
index 43b82b1..fe9e583 100644
--- a/main.cpp
+++ b/main.cpp
@@ -9,7 +9,6 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
-#include <map>
 #include <string>
 #include <vector>
 
@@ -69,7 +68,7 @@ void set_console_state(console_state new_st)
 static const int EOS_TOKEN_ID = 2;
 
 // determine number of model parts based on the dimension
-static const std::map<int, int> LLAMA_N_PARTS = {
+static const std::unordered_map<int, int> LLAMA_N_PARTS = {
     { 4096, 1 },
     { 5120, 2 },
     { 6656, 4 },
@@ -123,7 +122,7 @@ struct llama_model {
     //
     struct ggml_context * ctx;
 
-    std::map<std::string, struct ggml_tensor *> tensors;
+    std::unordered_map<std::string, struct ggml_tensor *> tensors;
 };
 
 // load the model's weights from a file
@@ -208,6 +207,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, llama_voca
     // load vocab
     {
         std::string word;
+        vocab.id_to_token.resize(model.hparams.n_vocab);
         std::vector<char> tmp(64);
 
         for (int i = 0; i < model.hparams.n_vocab; i++) {
@@ -227,8 +227,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, llama_voca
             fin.read((char *) &score, sizeof(score));
 
             vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-            vocab.score[i] = score;
+
+            auto &tok_score = vocab.id_to_token[i];
+            tok_score.tok = word;
+            tok_score.score = score;
         }
     }
 
@@ -1028,7 +1030,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).tok.c_str());
     }
     fprintf(stderr, "\n");
     if (params.interactive) {
@@ -1154,7 +1156,7 @@ int main(int argc, char ** argv) {
         // display text
         if (!input_noecho) {
             for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
+                printf("%s", vocab.id_to_token[id].tok.c_str());
             }
             fflush(stdout);
         }
@@ -1169,7 +1171,7 @@ int main(int argc, char ** argv) {
             // check for reverse prompt
             std::string last_output;
             for (auto id : last_n_tokens) {
-                last_output += vocab.id_to_token[id];
+                last_output += vocab.id_to_token[id].tok;
             }
 
             // Check if each of the reverse prompts appears at the end of the output.
diff --git a/quantize.cpp b/quantize.cpp
index b90f34f..52b7ac9 100644
--- a/quantize.cpp
+++ b/quantize.cpp
@@ -8,7 +8,6 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
-#include <map>
 #include <string>
 #include <vector>
 #include <regex>
@@ -130,6 +129,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
         }
 
         std::string word;
+        vocab.id_to_token.resize(n_vocab);
         for (int i = 0; i < n_vocab; i++) {
             uint32_t len;
             finp.read ((char *) &len, sizeof(len));
@@ -144,8 +144,10 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
             fout.write((char *) &score, sizeof(score));
 
             vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-            vocab.score[i] = score;
+
+            auto &tok_score = vocab.id_to_token[i];
+            tok_score.tok = word;
+            tok_score.score = score;
         }
     }
 
diff --git a/utils.cpp b/utils.cpp
index 7c6864c..b15c68a 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -155,8 +155,8 @@ void replace(std::string & str, const std::string & needle, const std::string &
     }
 }
 
-std::map<std::string, int32_t> json_parse(const std::string & fname) {
-    std::map<std::string, int32_t> result;
+std::unordered_map<std::string, int32_t> json_parse(const std::string & fname) {
+    std::unordered_map<std::string, int32_t> result;
 
     // read file into string
     std::string json;
@@ -360,16 +360,16 @@ private:
             return;
         }
 
-        auto score = vocab_.score.find((*token).second);
-
-        if (score == vocab_.score.end()) {
+        if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) {
             return;
         }
 
+        const auto &tok_score = vocab_.id_to_token[(*token).second];
+
         llama_sp_bigram bigram;
         bigram.left = left;
         bigram.right = right;
-        bigram.score = (*score).second;
+        bigram.score = tok_score.score;
         bigram.size = text.size();
         work_queue_.push(bigram);
     }
@@ -393,6 +393,8 @@ bool llama_vocab_load(const std::string & fname, llama_vocab & vocab) {
     std::string word;
     std::vector<char> tmp(64);
 
+    vocab.id_to_token.resize(n_vocab);
+
     for (int i = 0; i < n_vocab; i++) {
         uint32_t len;
         fin.read((char *) &len, sizeof(len));
@@ -410,8 +412,10 @@ bool llama_vocab_load(const std::string & fname, llama_vocab & vocab) {
         fin.read((char *) &score, sizeof(score));
 
         vocab.token_to_id[word] = i;
-        vocab.id_to_token[i] = word;
-        vocab.score[i] = score;
+
+        auto &tok_score = vocab.id_to_token[i];
+        tok_score.tok = word;
+        tok_score.score = score;
     }
 
     return true;
diff --git a/utils.h b/utils.h
index 6693775..3129038 100644
--- a/utils.h
+++ b/utils.h
@@ -3,7 +3,7 @@
 #pragma once
 
 #include <string>
-#include <map>
+#include <unordered_map>
 #include <vector>
 #include <random>
 #include <thread>
@@ -65,15 +65,19 @@ struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
 
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-    std::map<id, float> score;
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
 };
 
 void replace(std::string & str, const std::string & needle, const std::string & replacement);
 
 // poor-man's JSON parsing
-std::map<std::string, int32_t> json_parse(const std::string & fname);
+std::unordered_map<std::string, int32_t> json_parse(const std::string & fname);
 
 // TODO: temporary until #77 is merged, need this now for some tokenizer tests
 bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);
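
To see the data-layout change in isolation: the sketch below is not part of the patch; it mirrors the new llama_vocab declaration from utils.h and the fill pattern used at the three load sites, and the sample words and scores are made up for illustration.

// vocab_layout_sketch.cpp - standalone, builds with: g++ -std=c++11 vocab_layout_sketch.cpp
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct llama_vocab {
    using id    = int32_t;
    using token = std::string;

    // nested struct from the patch: token text and score live side by side
    struct token_score {
        token tok;
        float score;
    };

    // token -> id stays a map (now hashed): lookups are keyed by string
    std::unordered_map<token, id> token_to_id;
    // id -> (token, score): ids are dense 0..n_vocab-1, so one flat vector
    // replaces two separate std::map<id, ...> tree lookups with one index
    std::vector<token_score> id_to_token;
};

int main() {
    llama_vocab vocab;

    // made-up stand-ins for the (word, score) pairs read from the model file
    const std::vector<std::pair<std::string, float>> entries = {
        {"hello", -1.5f}, {"world", -2.0f}, {"!", -3.25f},
    };

    vocab.id_to_token.resize(entries.size());  // same resize the patch adds

    for (llama_vocab::id i = 0; i < (llama_vocab::id) entries.size(); i++) {
        vocab.token_to_id[entries[i].first] = i;

        auto &tok_score = vocab.id_to_token[i];  // same fill pattern as the patch
        tok_score.tok   = entries[i].first;
        tok_score.score = entries[i].second;
    }

    // id -> token is now an O(1) index instead of an O(log n) tree walk
    for (size_t i = 0; i < vocab.id_to_token.size(); i++) {
        const auto &ts = vocab.id_to_token[i];
        printf("%6zu -> '%s' (score %.2f)\n", i, ts.tok.c_str(), ts.score);
    }
    return 0;
}

The vector works here because the load loops assign ids consecutively, which is also why each load site must resize() before indexing: unlike std::map's operator[], vector indexing does not create missing entries on access. That is what the bounds check added to try_add_bigram guards against, replacing the old vocab_.score.find() test.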