@@ -86,7 +86,7 @@ struct llama_model {
 };
 
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
+bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     std::vector<char> f_buf(1024*1024);
@@ -207,8 +207,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
         ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
 
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
@@ -293,8 +293,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         const int n_mem      = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;
 
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+        model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
+        model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
 
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
@@ -814,8 +814,9 @@ int main(int argc, char ** argv) {
     // load the model
     {
+        const ggml_type memory_type = params.memory_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
         const int64_t t_start_us = ggml_time_us();
 
-        if (!llama_model_load(params.model, model, vocab, params.n_ctx)) {
+        if (!llama_model_load(params.model, model, vocab, params.n_ctx, memory_type)) {
            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
            return 1;
        }