Cohere: Use diff tool instead of Copied from mechanism #31211

younesbelkada wants to merge 5 commits into main

Conversation
ALL_LAYERNORM_LAYERS.append(CohereRMSNorm)


class CohereLayerNorm(CohereRMSNorm):
Kept in case users rely on the `CohereLayerNorm` class
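For readers following along, a minimal sketch of what such a backward-compatibility alias could look like, assuming one also wants a warning on instantiation; the stub base class, logger setup, and message wording are illustrative, not the PR's actual code:

```python
import logging

import torch
import torch.nn as nn

logger = logging.getLogger(__name__)


class CohereRMSNorm(nn.Module):
    """Stub standing in for the real CohereRMSNorm, only to keep the sketch runnable."""

    def __init__(self, hidden_size: int, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps


class CohereLayerNorm(CohereRMSNorm):
    """Deprecated alias kept so existing `CohereLayerNorm` imports keep working."""

    def __init__(self, *args, **kwargs):
        # Nudge users toward the new name without breaking their code.
        logger.warning("`CohereLayerNorm` is deprecated, use `CohereRMSNorm` instead.")
        super().__init__(*args, **kwargs)
```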
    def logit_scale(self):
        logger.warning(
            "`logit_scale` attribute is going to be deprecated in future versions, please use `model.config.logit_scale` instead."
        )
        return self.config.logit_scale

    @property
    def tie_word_embeddings(self):
        logger.warning(
            "`tie_word_embeddings` attribute is going to be deprecated in future versions, please use `model.config.tie_word_embeddings` instead."
        )
        return self.config.tie_word_embeddings
These attributes are public, but I suggest using the config variable directly, with a deprecation cycle, to make it cleaner
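As a rough illustration of the deprecation cycle being suggested (the shim class, the `FutureWarning` category, and the example value are assumptions, not the PR's code):

```python
import warnings
from types import SimpleNamespace


class CohereModelSketch:
    """Hypothetical stand-in for the real model class, just to show the shim."""

    def __init__(self, config):
        self.config = config

    @property
    def logit_scale(self):
        # Old public attribute kept for one release; the config stays the single
        # source of truth internally.
        warnings.warn(
            "`logit_scale` is deprecated, use `model.config.logit_scale` instead.",
            FutureWarning,
        )
        return self.config.logit_scale


model = CohereModelSketch(SimpleNamespace(logit_scale=0.0625))
old_way = model.logit_scale         # emits the FutureWarning
new_way = model.config.logit_scale  # preferred access going forward
```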
class CohereLinearScalingRotaryEmbedding(CohereRotaryEmbedding):
    """CohereRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def forward(self, x, position_ids):
        # difference to the original RoPE: a scaling factor is aplied to the position ids
        position_ids = position_ids.float() / self.scaling_factor
        cos, sin = super().forward(x, position_ids)
        return cos, sin


class CohereDynamicNTKScalingRotaryEmbedding(CohereRotaryEmbedding):
    """CohereRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def forward(self, x, position_ids):
        # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(x.device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: this may break with compilation

        cos, sin = super().forward(x, position_ids)
        return cos, sin
These classes are never used, but I couldn't find a way to remove them

there is no way to do so 😓 Maybe a skip layer?

hmmm yeah, or maybe it is ok to manually remove them for now
    @property
    def logit_scale(self):
        logger.warning(
            "`logit_scale` attribute is going to be deprecated in future versions, please use `model.config.logit_scale` instead."
        )
        return self.config.logit_scale

    @property
    def tie_word_embeddings(self):
        logger.warning(
            "`tie_word_embeddings` attribute is going to be deprecated in future versions, please use `model.config.tie_word_embeddings` instead."
        )
        return self.config.tie_word_embeddings
Any idea why these are not propagated in the generated modeling code?
I'll have to dive a bit into this!
Ok that's on me to do now!
The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.
    model_type = "cohere"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
For the init we can use `super().__init__()` and `**super_kwargs` to only change the arguments that are actually different from the defaults we have in Gemma 😉
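A minimal sketch of that pattern, assuming the diff file inherits from `GemmaConfig`; the parameter names and default values below are illustrative guesses, not the PR's exact code:

```python
from transformers import GemmaConfig


class CohereConfig(GemmaConfig):
    model_type = "cohere"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,   # assumed Cohere-specific default, overriding Gemma's
        logit_scale=0.0625,  # assumed Cohere-specific attribute
        **super_kwargs,
    ):
        self.logit_scale = logit_scale
        # Everything not overridden above keeps the default defined in GemmaConfig.
        super().__init__(vocab_size=vocab_size, **super_kwargs)
```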
_CONFIG_FOR_DOC = "CohereConfig"


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
That's a problem, no? The `_get_unpad_data` should still be present!

For some reason it has been pasted below: https://github.com/huggingface/transformers/pull/31211/files#r1647528352
        return attn_output, None, past_key_value


def _get_unpad_data(attention_mask):
the `_get_unpad_data` is pasted here @ArthurZucker
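For context, this is roughly what the Llama helper being copied computes; the body below is reconstructed from memory of the flash-attention unpadding utilities and should be treated as an approximation rather than the PR's generated code:

```python
import torch
import torch.nn.functional as F


def _get_unpad_data(attention_mask: torch.Tensor):
    # Number of real (non-padded) tokens per sequence in the batch.
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    # Flat indices of the non-padded positions, used to pack/unpack hidden states.
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    # Cumulative sequence lengths in the packed layout flash attention expects.
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch
```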
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread. Please note that issues that do not follow the contributing guidelines are likely to be ignored.
What does this PR do?
As per title
cc @ArthurZucker
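For readers unfamiliar with the two mechanisms named in the title, a rough sketch of the difference; the class choices, inherited parents, and file layout here are assumptions for illustration, not the PR's exact contents:

```python
# "Copied from" mechanism: the full implementation is duplicated in
# modeling_cohere.py, and a marker comment lets CI keep the copy in sync, e.g.
#
#   # Copied from transformers.models.llama.modeling_llama._get_unpad_data
#   def _get_unpad_data(attention_mask):
#       ...
#
# Diff tool: a small diff file declares only what differs from an existing model,
# and the full modeling file is generated from it. Illustrative sketch:

from transformers.models.llama.modeling_llama import LlamaMLP, LlamaModel


class CohereMLP(LlamaMLP):
    # Nothing overridden: the generated modeling file inherits the Llama body verbatim.
    pass


class CohereModel(LlamaModel):
    # Only behaviour that genuinely differs from Llama would be written out here.
    pass
```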