diff --git a/main.cpp b/main.cpp index 431c94b..5ba6d5a 100644 --- a/main.cpp +++ b/main.cpp @@ -258,6 +258,9 @@ int main(int argc, char ** argv) { params.interactive = true; } + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + fprintf(stderr, "\n"); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); @@ -359,6 +362,16 @@ int main(int argc, char ** argv) { last_n_tokens.push_back(id); } + // replace end of text token with newline token when in interactive mode + if (id == llama_token_eos() && params.interactive) { + id = llama_token_newline.front(); + if (params.antiprompt.size() != 0) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + } + } + // add it to the context embd.push_back(id); @@ -451,12 +464,8 @@ int main(int argc, char ** argv) { // end of text token if (embd.back() == llama_token_eos()) { - if (params.interactive) { - is_interacting = true; - } else { - fprintf(stderr, " [end of text]\n"); - break; - } + fprintf(stderr, " [end of text]\n"); + break; } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.