Hi, thanks for your excellent course. While working through it, I found two possible mistakes.
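Both are in Chapter 6, "The 🤗 Tokenizers library".
First, in the "Grouping entities" section, the score appended for each token is looked up with `pred` (the label of the first token in the group); I believe it should be looked up with that token's own predicted label, as in the diff below: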
import numpy as np
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]
idx = 0
while idx < len(predictions):
pred = predictions[idx]
label = model.config.id2label[pred]
if label != "O":
# Remove the B- or I-
label = label[2:]
start, _ = offsets[idx]
# Grab all the tokens labeled with I-label
all_scores = []
while (
idx < len(predictions)
and model.config.id2label[predictions[idx]] == f"I-{label}"
):
- all_scores.append(probabilities[idx][pred])
+ all_scores.append(probs[idx][predictions[idx]])
_, end = offsets[idx]
idx += 1
# The score is the mean of all the scores of the tokens in that grouped entity
score = np.mean(all_scores).item()
word = example[start:end]
results.append(
{
"entity_group": label,
"score": score,
"word": word,
"start": start,
"end": end,
}
)
idx += 1
print(results)
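Second, in the "Handling long contexts" section, the flat index returned by `argmax()` is decoded using `scores.shape[0]`; I believe it should be decoded using the number of columns, `scores.shape[1]`, as in the diff below: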
candidates = []
for start_probs, end_probs in zip(start_probabilities, end_probabilities):
scores = start_probs[:, None] * end_probs[None, :]
idx = torch.triu(scores).argmax().item()
- start_idx = idx // scores.shape[0]
- end_idx = idx % scores.shape[0]
+ start_idx = idx // scores.shape[1]
+ end_idx = idx % scores.shape[1]
score = scores[start_idx, end_idx].item()
candidates.append((start_idx, end_idx, score))
print(candidates)
Hi, thanks for your excellent course. While working through it, I found two possible mistakes.
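Both are in Chapter 6, "The 🤗 Tokenizers library".
First, in the "Grouping entities" section, the score appended for each token is looked up with `pred` (the label of the first token in the group); I believe it should be looked up with that token's own predicted label, as in the diff below: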
import numpy as np results = [] inputs_with_offsets = tokenizer(example, return_offsets_mapping=True) tokens = inputs_with_offsets.tokens() offsets = inputs_with_offsets["offset_mapping"] idx = 0 while idx < len(predictions): pred = predictions[idx] label = model.config.id2label[pred] if label != "O": # Remove the B- or I- label = label[2:] start, _ = offsets[idx] # Grab all the tokens labeled with I-label all_scores = [] while ( idx < len(predictions) and model.config.id2label[predictions[idx]] == f"I-{label}" ): - all_scores.append(probabilities[idx][pred]) + all_scores.append(probs[idx][predictions[idx]]) _, end = offsets[idx] idx += 1 # The score is the mean of all the scores of the tokens in that grouped entity score = np.mean(all_scores).item() word = example[start:end] results.append( { "entity_group": label, "score": score, "word": word, "start": start, "end": end, } ) idx += 1 print(results)In Handling long contexts,
candidates = [] for start_probs, end_probs in zip(start_probabilities, end_probabilities): scores = start_probs[:, None] * end_probs[None, :] idx = torch.triu(scores).argmax().item() - start_idx = idx // scores.shape[0] - end_idx = idx % scores.shape[0] + start_idx = idx // scores.shape[1] + end_idx = idx % scores.shape[1] score = scores[start_idx, end_idx].item() candidates.append((start_idx, end_idx, score)) print(candidates)