@ -176,8 +176,6 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
}
}
}
}
const ggml_type wtype2 = GGML_TYPE_F32 ;
auto & ctx = model . ctx ;
auto & ctx = model . ctx ;
size_t ctx_size = 0 ;
size_t ctx_size = 0 ;
@ -237,7 +235,6 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
const int n_embd = hparams . n_embd ;
const int n_embd = hparams . n_embd ;
const int n_layer = hparams . n_layer ;
const int n_layer = hparams . n_layer ;
const int n_ctx = hparams . n_ctx ;
const int n_vocab = hparams . n_vocab ;
const int n_vocab = hparams . n_vocab ;
model . layers . resize ( n_layer ) ;
model . layers . resize ( n_layer ) ;
@ -539,8 +536,6 @@ bool llama_eval(
const int n_vocab = hparams . n_vocab ;
const int n_vocab = hparams . n_vocab ;
const int n_rot = hparams . n_embd / hparams . n_head ;
const int n_rot = hparams . n_embd / hparams . n_head ;
const int d_key = n_embd / n_head ;
// TODO: check if this size scales with n_ctx linearly and remove constant. somehow I feel it wasn't the case
// TODO: check if this size scales with n_ctx linearly and remove constant. somehow I feel it wasn't the case
// static size_t buf_size = hparams.n_ctx*1024*1024;
// static size_t buf_size = hparams.n_ctx*1024*1024;
static size_t buf_size = 512u * 1024 * 1024 ;
static size_t buf_size = 512u * 1024 * 1024 ;
@ -849,9 +844,25 @@ int main(int argc, char ** argv) {
params . n_predict = std : : min ( params . n_predict , model . hparams . n_ctx - ( int ) embd_inp . size ( ) ) ;
params . n_predict = std : : min ( params . n_predict , model . hparams . n_ctx - ( int ) embd_inp . size ( ) ) ;
// prefix & suffix for instruct mode
const std : : vector < gpt_vocab : : id > inp_pfx = : : llama_tokenize ( vocab , " \n \n ### Instruction: \n \n " , true ) ;
const std : : vector < gpt_vocab : : id > inp_sfx = : : llama_tokenize ( vocab , " \n \n ### Response: \n \n " , false ) ;
// in instruct mode, we inject a prefix and a suffix to each input by the user
if ( params . instruct ) {
fprintf ( stderr , " == Instruction mode enabled == \n " ) ;
params . interactive = true ;
params . antiprompt = " ### Instruction: \n \n " ;
}
// tokenize the reverse prompt
// tokenize the reverse prompt
std : : vector < gpt_vocab : : id > antiprompt_inp = : : llama_tokenize ( vocab , params . antiprompt , false ) ;
std : : vector < gpt_vocab : : id > antiprompt_inp = : : llama_tokenize ( vocab , params . antiprompt , false ) ;
// enable interactive mode if reverse prompt is specified
if ( ! antiprompt_inp . empty ( ) ) {
params . interactive = true ;
}
fprintf ( stderr , " \n " ) ;
fprintf ( stderr , " \n " ) ;
fprintf ( stderr , " %s: prompt: '%s' \n " , __func__ , params . prompt . c_str ( ) ) ;
fprintf ( stderr , " %s: prompt: '%s' \n " , __func__ , params . prompt . c_str ( ) ) ;
fprintf ( stderr , " %s: number of tokens in prompt = %zu \n " , __func__ , embd_inp . size ( ) ) ;
fprintf ( stderr , " %s: number of tokens in prompt = %zu \n " , __func__ , embd_inp . size ( ) ) ;
@ -872,7 +883,7 @@ int main(int argc, char ** argv) {
fprintf ( stderr , " %s: interactive mode on. \n " , __func__ ) ;
fprintf ( stderr , " %s: interactive mode on. \n " , __func__ ) ;
if ( antiprompt_inp . size ( ) ) {
if ( antiprompt_inp . size ( ) ) {
fprintf ( stderr , " %s: reverse prompt: '%s' \n " , __func__ , params . antiprompt . c_str ( ) ) ;
fprintf ( stderr , " %s: reverse prompt: '%s' \n " , __func__ , params . antiprompt . c_str ( ) ) ;
fprintf ( stderr , " %s: number of tokens in reverse prompt = %zu \n " , __func__ , antiprompt_inp . size ( ) ) ;
fprintf ( stderr , " %s: number of tokens in reverse prompt = %zu \n " , __func__ , antiprompt_inp . size ( ) ) ;
for ( int i = 0 ; i < ( int ) antiprompt_inp . size ( ) ; i + + ) {
for ( int i = 0 ; i < ( int ) antiprompt_inp . size ( ) ; i + + ) {
@ -894,31 +905,27 @@ int main(int argc, char ** argv) {
std : : vector < gpt_vocab : : id > last_n_tokens ( last_n_size ) ;
std : : vector < gpt_vocab : : id > last_n_tokens ( last_n_size ) ;
std : : fill ( last_n_tokens . begin ( ) , last_n_tokens . end ( ) , 0 ) ;
std : : fill ( last_n_tokens . begin ( ) , last_n_tokens . end ( ) , 0 ) ;
if ( params . interactive ) {
if ( params . interactive ) {
fprintf ( stderr , " == Running in interactive mode. == \n "
fprintf ( stderr , " == Running in interactive mode. == \n "
# if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
# if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
" - Press Ctrl+C to interject at any time. \n "
" - Press Ctrl+C to interject at any time. \n "
# endif
# endif
" - Press Return to return control to LLaMa. \n "
" - Press Return to return control to LLaMa. \n "
" - If you want to submit another line, end your input in ' \\ '. \n " ) ;
" - If you want to submit another line, end your input in ' \\ '. \n \n " ) ;
is_interacting = true ;
}
}
int remaining_tokens = params . n_predict ;
int input_consumed = 0 ;
int input_consumed = 0 ;
bool input_noecho = false ;
bool input_noecho = false ;
// prompt user immediately after the starting prompt has been loaded
int remaining_tokens = params . n_predict ;
if ( params . interactive_start ) {
is_interacting = true ;
}
// set the color for the prompt which will be output initially
// set the color for the prompt which will be output initially
if ( params . use_color ) {
if ( params . use_color ) {
printf ( ANSI_COLOR_YELLOW ) ;
printf ( ANSI_COLOR_YELLOW ) ;
}
}
while ( remaining_tokens > 0 ) {
while ( remaining_tokens > 0 | | params . interactive ) {
// predict
// predict
if ( embd . size ( ) > 0 ) {
if ( embd . size ( ) > 0 ) {
const int64_t t_start_us = ggml_time_us ( ) ;
const int64_t t_start_us = ggml_time_us ( ) ;
@ -971,13 +978,13 @@ int main(int argc, char ** argv) {
last_n_tokens . erase ( last_n_tokens . begin ( ) ) ;
last_n_tokens . erase ( last_n_tokens . begin ( ) ) ;
last_n_tokens . push_back ( embd_inp [ input_consumed ] ) ;
last_n_tokens . push_back ( embd_inp [ input_consumed ] ) ;
+ + input_consumed ;
+ + input_consumed ;
if ( embd . size ( ) > params . n_batch ) {
if ( ( int ) embd . size ( ) > params . n_batch ) {
break ;
break ;
}
}
}
}
// reset color to default if there is no pending user input
// reset color to default if there is no pending user input
if ( ! input_noecho & & params . use_color & & embd_inp . size ( ) = = input_consumed ) {
if ( ! input_noecho & & params . use_color & & ( int ) embd_inp . size ( ) = = input_consumed ) {
printf ( ANSI_COLOR_RESET ) ;
printf ( ANSI_COLOR_RESET ) ;
}
}
}
}
@ -999,19 +1006,26 @@ int main(int argc, char ** argv) {
is_interacting = true ;
is_interacting = true ;
}
}
if ( is_interacting ) {
if ( is_interacting ) {
if ( params . instruct ) {
input_consumed = embd_inp . size ( ) ;
embd_inp . insert ( embd_inp . end ( ) , inp_pfx . begin ( ) , inp_pfx . end ( ) ) ;
printf ( " \n > " ) ;
}
// currently being interactive
// currently being interactive
bool another_line = true ;
bool another_line = true ;
while ( another_line ) {
while ( another_line ) {
fflush ( stdout ) ;
fflush ( stdout ) ;
char buf [ 256 ] = { 0 } ;
char buf [ 256 ] = { 0 } ;
int n_read ;
int n_read ;
if ( params . use_color ) printf ( ANSI_BOLD ANSI_COLOR_GREEN ) ;
if ( params . use_color ) printf ( ANSI_BOLD ANSI_COLOR_GREEN ) ;
if ( scanf ( " %255[^ \n ]%n%*c " , buf , & n_read ) < = 0 ) {
if ( scanf ( " %255[^ \n ]%n%*c " , buf , & n_read ) < = 0 ) {
// presumably an empty line, consume the newline
// presumably an empty line, consume the newline
std : : ignore = scanf ( " %*c " ) ;
std : : ignore = scanf ( " %*c " ) ;
n_read = 0 ;
n_read = 0 ;
}
}
if ( params . use_color ) printf ( ANSI_COLOR_RESET ) ;
if ( params . use_color ) printf ( ANSI_COLOR_RESET ) ;
if ( n_read > 0 & & buf [ n_read - 1 ] = = ' \\ ' ) {
if ( n_read > 0 & & buf [ n_read - 1 ] = = ' \\ ' ) {
another_line = true ;
another_line = true ;
@ -1026,6 +1040,10 @@ int main(int argc, char ** argv) {
std : : vector < gpt_vocab : : id > line_inp = : : llama_tokenize ( vocab , buf , false ) ;
std : : vector < gpt_vocab : : id > line_inp = : : llama_tokenize ( vocab , buf , false ) ;
embd_inp . insert ( embd_inp . end ( ) , line_inp . begin ( ) , line_inp . end ( ) ) ;
embd_inp . insert ( embd_inp . end ( ) , line_inp . begin ( ) , line_inp . end ( ) ) ;
if ( params . instruct ) {
embd_inp . insert ( embd_inp . end ( ) , inp_sfx . begin ( ) , inp_sfx . end ( ) ) ;
}
remaining_tokens - = line_inp . size ( ) ;
remaining_tokens - = line_inp . size ( ) ;
input_noecho = true ; // do not echo this again
input_noecho = true ; // do not echo this again
@ -1037,10 +1055,14 @@ int main(int argc, char ** argv) {
// end of text token
// end of text token
if ( embd . back ( ) = = 2 ) {
if ( embd . back ( ) = = 2 ) {
if ( params . interactive ) {
is_interacting = true ;
} else {
fprintf ( stderr , " [end of text] \n " ) ;
fprintf ( stderr , " [end of text] \n " ) ;
break ;
break ;
}
}
}
}
}
# if defined (_WIN32)
# if defined (_WIN32)
signal ( SIGINT , SIG_DFL ) ;
signal ( SIGINT , SIG_DFL ) ;