Skip to content

Commit 9ad094e

Browse files
committed
Emit T_BAD_CHARACTER for unexpected characters
Avoid having holes in the token stream, which are annoying and inefficient to reconstruct on the consumer side.
1 parent af76396 commit 9ad094e

File tree

5 files changed

+60
-2
lines changed

5 files changed

+60
-2
lines changed

UPGRADING

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,10 @@ PHP 7.4 UPGRADE NOTES
132132
PASSWORD_BCRYPT, PASSWORD_ARGON2I, and PASSWORD_ARGON2ID will continue to
133133
function correctly.
134134

135+
- Tokenizer:
136+
. token_get_all() will now emit a T_BAD_CHARACTER token for unexpected
137+
characters instead of leaving behind holes in the token stream.
138+
135139
========================================
136140
2. New Features
137141
========================================

Zend/zend_language_parser.y

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ static YYSIZE_T zend_yytnamerr(char*, const char*);
221221
%token T_COALESCE "?? (T_COALESCE)"
222222
%token T_POW "** (T_POW)"
223223
%token T_POW_EQUAL "**= (T_POW_EQUAL)"
224+
%token T_BAD_CHARACTER "invalid character (T_BAD_CHARACTER)"
224225

225226
/* Token used to force a parse error from the lexer */
226227
%token T_ERROR

Zend/zend_language_scanner.l

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2847,8 +2847,12 @@ nowdoc_scan_done:
28472847
RETURN_TOKEN(END);
28482848
}
28492849
2850-
zend_error(E_COMPILE_WARNING,"Unexpected character in input: '%c' (ASCII=%d) state=%d", yytext[0], yytext[0], YYSTATE);
2851-
goto restart;
2850+
zend_error(E_COMPILE_WARNING, "Unexpected character in input: '%c' (ASCII=%d) state=%d", yytext[0], yytext[0], YYSTATE);
2851+
if (PARSER_MODE()) {
2852+
goto restart;
2853+
} else {
2854+
RETURN_TOKEN(T_BAD_CHARACTER);
2855+
}
28522856
}
28532857
28542858
*/
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
--TEST--
2+
token_get_all() produces T_BAD_CHARACTER for unexpected characters
3+
--FILE--
4+
<?php
5+
6+
$codes = [
7+
"<?php \0 foo",
8+
"<?php \1 bar",
9+
"<?php \1\2 bar \3",
10+
];
11+
12+
foreach ($codes as $code) {
13+
foreach (token_get_all($code) as $token) {
14+
if (is_array($token)) {
15+
echo token_name($token[0]), " ", strlen($token[1]), "\n";
16+
} else {
17+
echo $token, "\n";
18+
}
19+
}
20+
}
21+
22+
?>
23+
--EXPECTF--
24+
Warning: Unexpected character in input: ' in %s on line %d
25+
T_OPEN_TAG 6
26+
T_BAD_CHARACTER 1
27+
T_WHITESPACE 1
28+
T_STRING 3
29+
30+
Warning: Unexpected character in input: '%s' (ASCII=1) state=0 in %s on line %d
31+
T_OPEN_TAG 6
32+
T_BAD_CHARACTER 1
33+
T_WHITESPACE 1
34+
T_STRING 3
35+
36+
Warning: Unexpected character in input: '%s' (ASCII=1) state=0 in %s on line %d
37+
38+
Warning: Unexpected character in input: '%s' (ASCII=2) state=0 in %s on line %d
39+
40+
Warning: Unexpected character in input: '%s' (ASCII=3) state=0 in %s on line %d
41+
T_OPEN_TAG 6
42+
T_BAD_CHARACTER 1
43+
T_BAD_CHARACTER 1
44+
T_WHITESPACE 1
45+
T_STRING 3
46+
T_WHITESPACE 1
47+
T_BAD_CHARACTER 1

ext/tokenizer/tokenizer_data.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ void tokenizer_register_constants(INIT_FUNC_ARGS) {
162162
REGISTER_LONG_CONSTANT("T_NS_C", T_NS_C, CONST_CS | CONST_PERSISTENT);
163163
REGISTER_LONG_CONSTANT("T_NS_SEPARATOR", T_NS_SEPARATOR, CONST_CS | CONST_PERSISTENT);
164164
REGISTER_LONG_CONSTANT("T_ELLIPSIS", T_ELLIPSIS, CONST_CS | CONST_PERSISTENT);
165+
REGISTER_LONG_CONSTANT("T_BAD_CHARACTER", T_BAD_CHARACTER, CONST_CS | CONST_PERSISTENT);
165166
REGISTER_LONG_CONSTANT("T_DOUBLE_COLON", T_PAAMAYIM_NEKUDOTAYIM, CONST_CS | CONST_PERSISTENT);
166167
}
167168

@@ -304,6 +305,7 @@ char *get_token_type_name(int token_type)
304305
case T_NS_C: return "T_NS_C";
305306
case T_NS_SEPARATOR: return "T_NS_SEPARATOR";
306307
case T_ELLIPSIS: return "T_ELLIPSIS";
308+
case T_BAD_CHARACTER: return "T_BAD_CHARACTER";
307309

308310
}
309311
return "UNKNOWN";

0 commit comments

Comments
 (0)