Skip to content

Commit 316319d

Browse files
committed
Warn if a file contains UTF-8 and Latin-1
Add a new warning, non-ascii-utf8, which is displayed only if the non-ascii attribute is specified and UTF-8 characters were ignored in the copyright or authors lines of the header.
1 parent a98a996 commit 316319d

1 file changed

Lines changed: 19 additions & 2 deletions

File tree

tools/check-typo

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,10 +162,13 @@ IGNORE_DIRS="
162162
(cat "$f" | tr -d '\r'; echo) \
163163
| awk -v rules="$rules" -v svnrules="$svnrules" -v file="$f" \
164164
'
165+
function is_err(name) {
166+
return (("," rules svnrules ",") !~ ("[, ]" name "[, ]"));
167+
}
168+
165169
function err(name, msg) {
166170
++ counts[name];
167-
if (("," rules svnrules ",") !~ ("[, ]" name "[, ]") \
168-
&& counts[name] <= 10){
171+
if (is_err(name) && counts[name] <= 10){
169172
printf ("%s:%d.%d:", file, NR, RSTART + RLENGTH);
170173
printf (" [%s] %s\n", name, msg);
171174
got_errors = 1;
@@ -207,6 +210,10 @@ IGNORE_DIRS="
207210
match($0, /[\200-\377]/) \
208211
&& state != "authors" && state != "copyright" {
209212
err("non-ascii", "non-ASCII character(s)");
213+
if (header_utf8 && !is_err("non-ascii")) {
214+
err("non-ascii-utf8", \
215+
"non-ASCII character(s) AND UTF-8 encountered");
216+
}
210217
}
211218
212219
match($0, /[^\t\200-\377 -~]/) {
@@ -237,6 +244,16 @@ IGNORE_DIRS="
237244
err("very-long-line", "line is over 132 columns");
238245
}
239246
247+
# Record that the header contained UTF-8 sequences
248+
match($0, /[\300-\367][\200-\277]+/) \
249+
&& (state == "authors" || state == "copyright") {
250+
header_utf8 = 1;
251+
if (counts["non-ascii"] > 0 && is_err("non-ascii")) {
252+
err("non-ascii-utf8", \
253+
"non-ASCII character(s) AND UTF-8 encountered");
254+
}
255+
}
256+
240257
# Header-recognition automaton. Read this from bottom to top.
241258
# Valid UTF-8 chars are recognised in copyright and authors
242259
# TODO: ensure all files are valid UTF-8 before awking them.

0 commit comments

Comments
 (0)