diff --git a/.devops/tools.sh b/.devops/tools.sh index b5711c9..352e049 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -34,7 +34,7 @@ else echo "Unknown command: $arg1" echo "Available commands: " echo " --run (-r): Run a model previously converted into ggml" - echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" echo " --convert (-c): Convert a llama model into ggml" echo " ex: \"/models/7B/\" 1" echo " --quantize (-q): Optimize with quantization process ggml" diff --git a/README.md b/README.md index 8cf59f4..7338ea7 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Supported platforms: Here is a typical run using LLaMA-7B: ```java -make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 I llama.cpp build info: I UNAME_S: Darwin I UNAME_P: arm @@ -150,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1 ./quantize.sh 7B # run the inference -./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128 +./main -m ./models/7B/ggml-model-q4_0.bin -n 128 ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. @@ -164,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o Here is an example few-shot interaction, invoked with the command ``` -./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \ +./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \ -p \ "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. @@ -218,13 +218,13 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-on On complete, you are ready to play! ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with light image: ```bash -docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512 +docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` ## Limitations diff --git a/ggml.c b/ggml.c index c4f8389..4fb83ad 100644 --- a/ggml.c +++ b/ggml.c @@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - if (cgraph->n_threads <= 0) { - cgraph->n_threads = 8; - } - const int n_threads = cgraph->n_threads; struct ggml_compute_state_shared state_shared = { diff --git a/utils.cpp b/utils.cpp index 26e313d..9e50487 100644 --- a/utils.cpp +++ b/utils.cpp @@ -16,6 +16,18 @@ #endif bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + // determine sensible default number of threads. + // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. +#ifdef __linux__ + std::ifstream cpuinfo("/proc/cpuinfo"); + params.n_threads = std::count(std::istream_iterator(cpuinfo), + std::istream_iterator(), + std::string("processor")); +#endif + if (params.n_threads == 0) { + params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); + } + for (int i = 1; i < argc; i++) { std::string arg = argv[i]; diff --git a/utils.h b/utils.h index 021120b..5e5b40f 100644 --- a/utils.h +++ b/utils.h @@ -14,7 +14,7 @@ struct gpt_params { int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t n_threads; int32_t n_predict = 128; // new tokens to predict int32_t repeat_last_n = 64; // last n tokens to penalize int32_t n_ctx = 512; //context size