@@ -1696,6 +1696,84 @@ def _set_vocab_mistral(self):
16961696 if template is not None :
16971697 self .gguf_writer .add_chat_template (template )
16981698
1699+ def _set_vocab_plamo (self ):
1700+ # PLaMo models use a custom tokenizer with a .jsonl file
1701+ tokenizer_jsonl_path = self .dir_model / "tokenizer.jsonl"
1702+ tokenizer_config_path = self .dir_model / "tokenizer_config.json"
1703+
1704+ if not tokenizer_jsonl_path .is_file ():
1705+ raise FileNotFoundError (f"PLaMo tokenizer file not found: { tokenizer_jsonl_path } " )
1706+
1707+ # Load tokenizer config
1708+ with open (tokenizer_config_path , "r" , encoding = "utf-8" ) as f :
1709+ tokenizer_config = json .load (f )
1710+
1711+ # Load tokens from JSONL file (actually a list format)
1712+ tokens = []
1713+ scores = []
1714+ toktypes = []
1715+
1716+ with open (tokenizer_jsonl_path , "r" , encoding = "utf-8" ) as f :
1717+ for line_num , line in enumerate (f ):
1718+ if line .strip ():
1719+ token_data = json .loads (line )
1720+ # Format: [token, score, type, ?, ?, ?, ?]
1721+ token = token_data [0 ].encode ("utf-8" )
1722+ score = float (token_data [1 ])
1723+ token_type_str = token_data [2 ] if len (token_data ) > 2 else "NORMAL"
1724+
1725+ tokens .append (token )
1726+ scores .append (score )
1727+
1728+ if token_type_str == "UNKNOWN" :
1729+ toktypes .append (gguf .TokenType .UNKNOWN )
1730+ elif token_type_str == "CONTROL" :
1731+ toktypes .append (gguf .TokenType .CONTROL )
1732+ elif token_type_str == "BYTE" :
1733+ toktypes .append (gguf .TokenType .BYTE )
1734+ else :
1735+ token_str = token_data [0 ]
1736+ if token_str .startswith ("<|plamo:" ) and token_str .endswith ("|>" ):
1737+ toktypes .append (gguf .TokenType .CONTROL )
1738+ else :
1739+ toktypes .append (gguf .TokenType .NORMAL )
1740+
1741+ vocab_size = self .hparams ["vocab_size" ]
1742+ if vocab_size > len (tokens ):
1743+ pad_count = vocab_size - len (tokens )
1744+ logger .debug (f"Padding vocab with { pad_count } token(s) - [PAD1] through [PAD{ pad_count } ]" )
1745+ for i in range (1 , pad_count + 1 ):
1746+ tokens .append (bytes (f"[PAD{ i } ]" , encoding = "utf-8" ))
1747+ scores .append (- 1000.0 )
1748+ toktypes .append (gguf .TokenType .UNUSED )
1749+
1750+ self .gguf_writer .add_tokenizer_model ("plamo2" )
1751+ self .gguf_writer .add_tokenizer_pre ("default" )
1752+ self .gguf_writer .add_token_list (tokens )
1753+ self .gguf_writer .add_token_scores (scores )
1754+ self .gguf_writer .add_token_types (toktypes )
1755+
1756+ if "bos_token" in tokenizer_config and tokenizer_config ["bos_token" ] is not None :
1757+ token_id = tokens .index (tokenizer_config ["bos_token" ].encode ("utf-8" ))
1758+ self .gguf_writer .add_bos_token_id (token_id )
1759+ if "eos_token" in tokenizer_config and tokenizer_config ["eos_token" ] is not None :
1760+ token_id = tokens .index (tokenizer_config ["eos_token" ].encode ("utf-8" ))
1761+ self .gguf_writer .add_eos_token_id (token_id )
1762+ if "pad_token" in tokenizer_config and tokenizer_config ["pad_token" ] is not None :
1763+ token_id = tokens .index (tokenizer_config ["pad_token" ].encode ("utf-8" ))
1764+ self .gguf_writer .add_pad_token_id (token_id )
1765+ if "sep_token" in tokenizer_config and tokenizer_config ["sep_token" ] is not None :
1766+ token_id = tokens .index (tokenizer_config ["sep_token" ].encode ("utf-8" ))
1767+ self .gguf_writer .add_sep_token_id (token_id )
1768+ if "unk_token" in tokenizer_config and tokenizer_config ["unk_token" ] is not None :
1769+ token_id = tokens .index (tokenizer_config ["unk_token" ].encode ("utf-8" ))
1770+ self .gguf_writer .add_unk_token_id (token_id )
1771+
1772+ # Add <|plamo:op|> as EOT to ensure appropriate end of generation
1773+ self .gguf_writer .add_eot_token_id (4 )
1774+
1775+ self .gguf_writer .add_add_space_prefix (False )
1776+
16991777
17001778class MmprojModel (ModelBase ):
17011779 model_type = ModelType .MMPROJ
@@ -4798,87 +4876,7 @@ class Plamo2Model(TextModel):
47984876 model_arch = gguf .MODEL_ARCH .PLAMO2
47994877
    def set_vocab(self):
        # PLaMo 2 uses the shared PLaMo JSONL tokenizer loader
        # (tokenizer.jsonl + tokenizer_config.json) defined on the base class.
        self._set_vocab_plamo()
48824880
48834881 def set_gguf_parameters (self ):
48844882 hparams = self .hparams
@@ -4966,6 +4964,56 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
49664964 return [(new_name , data_torch )]
49674965
49684966
@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM")
class Plamo3Model(TextModel):
    model_arch = gguf.MODEL_ARCH.PLAMO3

    def set_vocab(self):
        """Write the PLaMo tokenizer, then attach a chat template if present."""
        self._set_vocab_plamo()

        config_path = self.dir_model / "tokenizer_config.json"
        config = {}
        if config_path.is_file():
            with open(config_path, encoding="utf-8") as fin:
                config = json.load(fin)

        template = config.get("chat_template")

        # A standalone chat_template.jinja file takes precedence over the
        # entry in tokenizer_config.json.
        jinja_path = self.dir_model / "chat_template.jinja"
        if jinja_path.is_file():
            with open(jinja_path, encoding="utf-8") as fin:
                template = fin.read()

        if template:
            self.gguf_writer.add_chat_template(template)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

        sliding_window = self.find_hparam(["window_size", "sliding_window"], optional=True)
        if sliding_window is not None:
            self.gguf_writer.add_sliding_window(sliding_window)
        self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"])

        # Prefer the sliding_attention section of rope_parameters; fall back
        # to hparams["rope_local_theta"] when it is absent.
        fallback = {"rope_theta": self.hparams.get("rope_local_theta")}
        swa_rope = self.rope_parameters.get("sliding_attention", fallback)
        self.gguf_writer.add_rope_freq_base_swa(swa_rope["rope_theta"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # NOTE(review): the constant offsets presumably convert checkpoint
        # norm weights stored relative to 1 into effective scales - confirm
        # against the PLaMo 3 reference implementation.
        if name.endswith(".post_mixer_norm.weight"):
            data_torch = data_torch + 1.0 / 5
        elif name.endswith(".post_mlp_norm.weight"):
            data_torch = data_torch + 1.0 / (5 ** 1.5)
        elif name.endswith((
            ".pre_mixer_norm.weight",
            ".pre_mlp_norm.weight",
            ".mixer.q_norm.weight",
            ".mixer.k_norm.weight",
            ".norm.weight",
        )):
            # All remaining norm weights get the same +1.0 offset; none of the
            # post_* names above can match ".norm.weight" (preceded by "_"),
            # so merging these branches preserves the original dispatch.
            data_torch = data_torch + 1.0

        return [(self.map_tensor_name(name), data_torch)]
5016+
49695017@ModelBase .register ("CodeShellForCausalLM" )
49705018class CodeShellModel (TextModel ):
49715019 model_arch = gguf .MODEL_ARCH .CODESHELL
0 commit comments