Don't tell users to use a bad number of threads (#243)

The readme tells people to use the command line option "-t 8", causing 8
threads to be started. On systems with fewer than 8 cores, this causes a
significant slowdown. Remove the option from the example command lines
and use /proc/cpuinfo on Linux to determine a sensible default.
Author: Stephan Walter (committed via GitHub)
Parent: 6b0df5ccf3
Commit: 367946c668

@@ -34,7 +34,7 @@ else
 echo "Unknown command: $arg1"
 echo "Available commands: "
 echo " --run (-r): Run a model previously converted into ggml"
-echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
+echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
 echo " --convert (-c): Convert a llama model into ggml"
 echo " ex: \"/models/7B/\" 1"
 echo " --quantize (-q): Optimize with quantization process ggml"

@@ -39,7 +39,7 @@ Supported platforms:
 Here is a typical run using LLaMA-7B:
 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 I llama.cpp build info:
 I UNAME_S: Darwin
 I UNAME_P: arm
@@ -150,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./quantize.sh 7B
 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -164,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o
 Here is an example few-shot interaction, invoked with the command
 ```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
 -p \
 "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
@@ -218,13 +218,13 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-on
 On complete, you are ready to play!
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 or with light image:
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 ## Limitations

@@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }

 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    if (cgraph->n_threads <= 0) {
-        cgraph->n_threads = 8;
-    }
-
     const int n_threads = cgraph->n_threads;

     struct ggml_compute_state_shared state_shared = {

@@ -16,6 +16,18 @@
 #endif

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];

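For reference, the added default relies on <fstream>, <algorithm>, <iterator>, <string> and <thread> being included elsewhere in utils.cpp. Below is a minimal, self-contained sketch of the same logic as a standalone program; it is illustrative only and not part of the patch:

```cpp
// Sketch: pick a default thread count the same way the patch does.
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iterator>
#include <string>
#include <thread>

int main() {
    int n_threads = 0;
#ifdef __linux__
    // /proc/cpuinfo contains one "processor : N" entry per logical CPU, so
    // counting the whitespace-delimited token "processor" yields that count.
    std::ifstream cpuinfo("/proc/cpuinfo");
    n_threads = (int) std::count(std::istream_iterator<std::string>(cpuinfo),
                                 std::istream_iterator<std::string>(),
                                 std::string("processor"));
#endif
    if (n_threads == 0) {
        // hardware_concurrency() may return 0; never go below one thread.
        n_threads = std::max(1, (int) std::thread::hardware_concurrency());
    }
    std::printf("default n_threads = %d\n", n_threads);
    return 0;
}
```

Note that this counts logical CPUs (including SMT siblings) rather than physical cores.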
@@ -14,7 +14,7 @@
 struct gpt_params {
     int32_t seed = -1; // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_threads;
     int32_t n_predict = 128; // new tokens to predict
     int32_t repeat_last_n = 64; // last n tokens to penalize
     int32_t n_ctx = 512; //context size

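With the in-class initializer gone, n_threads only receives a value inside gpt_params_parse, so any code that fills in a gpt_params by hand has to set it explicitly. A small sketch of that situation (struct abridged to the fields in the hunk above; the fallback choice is illustrative, not prescribed by the patch):

```cpp
#include <algorithm>
#include <cstdint>
#include <thread>

// Abridged version of the struct after this change.
struct gpt_params {
    int32_t seed      = -1;  // RNG seed
    int32_t n_threads;       // no default; normally filled in by gpt_params_parse
    int32_t n_predict = 128; // new tokens to predict
};

int main() {
    gpt_params params;
    // Bypassing gpt_params_parse means choosing a thread count yourself,
    // otherwise n_threads stays uninitialized.
    params.n_threads = std::max(1, (int) std::thread::hardware_concurrency());
    return 0;
}
```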