@ -6,6 +6,7 @@
# include <regex>
# include <iostream>
# include <iterator>
# include <queue>
# include <string>
# include <math.h>
@ -294,58 +295,146 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
return tokens ;
}
// TODO: Calculate this constant from the vocabulary
# define MAX_TOKEN_LEN 18
// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
std : : vector < gpt_vocab : : id > llama_tokenize ( const gpt_vocab & vocab , const std : : string & text , bool bos ) {
std : : vector < gpt_vocab : : id > res ;
std : : vector < int > score ;
std : : vector < gpt_vocab : : id > prev ;
int len = text . length ( ) ;
static size_t utf8_len ( char src ) {
const size_t lookup [ ] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 , 3 , 4 } ;
uint8_t highbits = static_cast < uint8_t > ( src ) > > 4 ;
return lookup [ highbits ] ;
}
struct llama_sp_symbol {
using index = int ;
index prev ;
index next ;
std : : string_view text ;
} ;
struct llama_sp_bigram {
struct comparator {
bool operator ( ) ( llama_sp_bigram & l , llama_sp_bigram & r ) {
return ( l . score < r . score ) | | ( l . score = = r . score & & l . left > r . left ) ;
}
} ;
using queue_storage = std : : vector < llama_sp_bigram > ;
using queue = std : : priority_queue < llama_sp_bigram , queue_storage , comparator > ;
llama_sp_symbol : : index left ;
llama_sp_symbol : : index right ;
float score ;
size_t size ;
} ;
struct llama_tokenizer {
llama_tokenizer ( const gpt_vocab & vocab ) : vocab_ ( vocab ) { }
void tokenize ( std : : string_view text , std : : vector < gpt_vocab : : id > & output ) {
// split string into utf8 chars
int index = 0 ;
while ( ! text . empty ( ) ) {
llama_sp_symbol sym ;
size_t char_len = std : : min ( text . size ( ) , utf8_len ( text . data ( ) [ 0 ] ) ) ;
sym . text = std : : string_view ( text . data ( ) , char_len ) ;
sym . prev = index - 1 ;
text . remove_prefix ( char_len ) ;
sym . next = text . empty ( ) ? - 1 : index + 1 ;
index + + ;
symbols_ . emplace_back ( std : : move ( sym ) ) ;
}
// seed the work queue with all possible 2-character tokens.
for ( size_t i = 1 ; i < symbols_ . size ( ) ; + + i ) {
try_add_bigram ( i - 1 , i ) ;
}
// keep substituting the highest frequency pairs for as long as we can.
while ( ! work_queue_ . empty ( ) ) {
auto bigram = work_queue_ . top ( ) ;
work_queue_ . pop ( ) ;
auto & left_sym = symbols_ [ bigram . left ] ;
auto & right_sym = symbols_ [ bigram . right ] ;
// if one of the symbols already got merged, skip it.
if ( left_sym . text . empty ( ) | | right_sym . text . empty ( ) | |
left_sym . text . size ( ) + right_sym . text . size ( ) ! = bigram . size ) {
continue ;
}
score . resize ( len + 1 ) ;
prev . resize ( len + 1 ) ;
// merge the right sym into the left one
left_sym . text = std : : string_view ( left_sym . text . data ( ) , left_sym . text . size ( ) + right_sym . text . size ( ) ) ;
right_sym . text = std : : string_view ( " " ) ;
// Forward pass
for ( int i = 0 ; i < len ; i + + ) {
int max_len = std : : min ( len - i , MAX_TOKEN_LEN ) ;
for ( int sub_len = 1 ; sub_len < = max_len ; sub_len + + ) {
auto sub = text . substr ( i , sub_len ) ;
auto token = vocab . token_to_id . find ( sub ) ;
if ( token ! = vocab . token_to_id . end ( ) ) {
int token_score = sub . length ( ) * sub . length ( ) ;
int local_score = score [ i ] + token_score ;
int next = i + sub_len ;
if ( score [ next ] < local_score ) {
score [ next ] = local_score ;
prev [ next ] = ( * token ) . second ;
// remove the right sym from the chain
left_sym . next = right_sym . next ;
if ( right_sym . next > = 0 ) {
symbols_ [ right_sym . next ] . prev = bigram . left ;
}
// find more substitutions
try_add_bigram ( left_sym . prev , bigram . left ) ;
try_add_bigram ( bigram . left , left_sym . next ) ;
}
for ( int i = 0 ; i ! = - 1 ; i = symbols_ [ i ] . next ) {
auto & symbol = symbols_ [ i ] ;
auto token = vocab_ . token_to_id . find ( std : : string ( symbol . text ) ) ;
if ( token = = vocab_ . token_to_id . end ( ) ) {
// output any symbols that did not form tokens as bytes.
for ( int j = 0 ; j < symbol . text . size ( ) ; + + j ) {
gpt_vocab : : id token_id = static_cast < uint8_t > ( symbol . text [ j ] ) + 3 ;
output . push_back ( token_id ) ;
}
} else {
output . push_back ( ( * token ) . second ) ;
}
}
}
// Backward pass
int i = len ;
while ( i > 0 ) {
gpt_vocab : : id token_id = prev [ i ] ;
if ( token_id = = 0 ) {
// TODO: Return error or something more meaningful
printf ( " failed to tokenize string! \n " ) ;
break ;
private :
void try_add_bigram ( int left , int right ) {
if ( left = = - 1 | | right = = - 1 ) {
return ;
}
res . push_back ( token_id ) ;
auto token = ( * vocab . id_to_token . find ( token_id ) ) . second ;
i - = token . length ( ) ;
std : : string_view text ( symbols_ [ left ] . text . data ( ) , symbols_ [ left ] . text . size ( ) + symbols_ [ right ] . text . size ( ) ) ;
auto token = vocab_ . token_to_id . find ( std : : string ( text ) ) ;
if ( token = = vocab_ . token_to_id . end ( ) ) {
return ;
}
if ( bos ) {
res . push_back ( 1 ) ; // TODO: replace with vocab.bos
auto score = vocab_ . score . find ( ( * token ) . second ) ;
if ( score = = vocab_ . score . end ( ) ) {
return ;
}
llama_sp_bigram bigram ;
bigram . left = left ;
bigram . right = right ;
bigram . score = ( * score ) . second ;
bigram . size = text . size ( ) ;
work_queue_ . push ( bigram ) ;
}
const gpt_vocab & vocab_ ;
std : : vector < llama_sp_symbol > symbols_ ;
llama_sp_bigram : : queue work_queue_ ;
} ;
std : : vector < gpt_vocab : : id > llama_tokenize ( const gpt_vocab & vocab , std : : string_view text , bool bos ) {
llama_tokenizer tokenizer ( vocab ) ;
std : : vector < gpt_vocab : : id > output ;
if ( text . size ( ) = = 0 ) {
return output ;
}
// Pieces are in reverse order so correct that
std : : reverse ( res . begin ( ) , res . end ( ) ) ;
if ( bos ) {
output . push_back ( 1 ) ;
}
return res ;
tokenizer . tokenize ( text , output ) ;
return output ;
}
bool gpt_vocab_init ( const std : : string & fname , gpt_vocab & vocab ) {