Add Protocol Buffer Text Format Language (txtpb) (#1165)

Simran-B · web-flow · commit 8b8574c4d350 · 2025-11-26T22:07:46.000+11:00
diff --git a/README.md b/README.md
@@ -51,7 +51,7 @@ translators for Pygments lexers and styles.
 |   M    | Makefile, Mako, markdown, Mason, Materialize SQL dialect, Mathematica, Matlab, MCFunction, Meson, Metal, MiniZinc, MLIR, Modula-2, Mojo, MonkeyC, MoonScript, MorrowindScript, Myghty, MySQL
 |   N    | NASM, Natural, NDISASM, Newspeak, Nginx configuration file, Nim, Nix, NSIS, Nu
 |   O    | Objective-C, ObjectPascal, OCaml, Octave, Odin, OnesEnterprise, OpenEdge ABL, OpenSCAD, Org Mode
-|   P    | PacmanConf, Perl, PHP, PHTML, Pig, PkgConfig, PL/pgSQL, plaintext, Plutus Core, Pony, PostgreSQL SQL dialect, PostScript, POVRay, PowerQuery, PowerShell, Prolog, Promela, PromQL, properties, Protocol Buffer, PRQL, PSL, Puppet, Python, Python 2
+|   P    | PacmanConf, Perl, PHP, PHTML, Pig, PkgConfig, PL/pgSQL, plaintext, Plutus Core, Pony, PostgreSQL SQL dialect, PostScript, POVRay, PowerQuery, PowerShell, Prolog, Promela, PromQL, properties, Protocol Buffer, Protocol Buffer Text Format, PRQL, PSL, Puppet, Python, Python 2
 |   Q    | QBasic, QML
 |   R    | R, Racket, Ragel, Raku, react, ReasonML, reg, Rego, reStructuredText, Rexx, RPGLE, RPMSpec, Ruby, Rust
 |   S    | SAS, Sass, Scala, Scheme, Scilab, SCSS, Sed, Sieve, Smali, Smalltalk, Smarty, SNBT, Snobol, Solidity, SourcePawn, SPARQL, SQL, SquidConf, Standard ML, stas, Stylus, Svelte, Swift, SYSTEMD, systemverilog
diff --git a/lexers/embedded/txtpb.xml b/lexers/embedded/txtpb.xml
@@ -0,0 +1,162 @@
+<lexer>
+  <config> <!-- Lexer identity, file/MIME association and lexer-wide matching flags -->
+    <name>Protocol Buffer Text Format</name>
+    <alias>txtpb</alias>
+    <filename>*.txtpb</filename>
+    <filename>*.textproto</filename>
+    <filename>*.textpb</filename>
+    <filename>*.pbtxt</filename>
+    <mime_type>application/x-protobuf-text</mime_type>
+    <case_insensitive>false</case_insensitive> <!-- rules that need case folding opt in with an inline (?i), as the inf/nan rules do -->
+    <dot_all>false</dot_all> <!-- presumably keeps "." from matching newlines, which the "#.*\n" comment rule relies on; confirm against Chroma's docs -->
+    <ensure_nl>true</ensure_nl> <!-- the "#.*\n" comment rule needs a closing newline; the test input's last line is a comment written to rely on this -->
+  </config>
+  <rules>
+    <state name="escapes">
+      <!-- Escape sequences common to both quoted-string states, which include this state first -->
+      <!-- Single-character escapes: backslash plus one of a b f n r t v \ ? ' " -->
+      <rule pattern="\\[abfnrtv\\\?&#39;&#34;]">
+        <token type="LiteralStringEscape"/>
+      </rule>
+      <!-- Octal escape: backslash plus one to three octal digits -->
+      <rule pattern="\\[0-7]{1,3}">
+        <token type="LiteralStringEscape"/>
+      </rule>
+      <!-- Hex escape: \x plus one or two hex digits -->
+      <rule pattern="\\x[0-9a-fA-F]{1,2}">
+        <token type="LiteralStringEscape"/>
+      </rule>
+      <!-- Short Unicode escape: \u plus exactly four hex digits -->
+      <rule pattern="\\u[0-9a-fA-F]{4}">
+        <token type="LiteralStringEscape"/>
+      </rule>
+      <!-- Long Unicode escape, \U000xxxxx form (values 0x00000 to 0xFFFFF) -->
+      <rule pattern="\\U000[0-9a-fA-F]{5}">
+        <token type="LiteralStringEscape"/>
+      </rule>
+      <!-- Long Unicode escape, \U0010xxxx form (values 0x100000 to 0x10FFFF) -->
+      <rule pattern="\\U0010[0-9a-fA-F]{4}">
+        <token type="LiteralStringEscape"/>
+      </rule>
+    </state>
+
+    <state name="double-quote">
+      <!-- Inside "...": escapes first, then plain text, then the closing quote pops back to the pushing state -->
+      <rule>
+        <include state="escapes"/>
+      </rule>
+      <rule pattern="[^&#34;\\]+">
+        <token type="LiteralStringDouble"/>
+      </rule>
+      <rule pattern="&#34;">
+        <token type="LiteralStringDouble"/>
+        <pop depth="1"/>
+      </rule>
+    </state>
+
+    <state name="single-quote">
+      <!-- Inside '...': same layout as the double-quote state, with the quote characters swapped -->
+      <rule>
+        <include state="escapes"/>
+      </rule>
+      <rule pattern="[^&#39;\\]+">
+        <token type="LiteralStringSingle"/>
+      </rule>
+      <rule pattern="&#39;">
+        <token type="LiteralStringSingle"/>
+        <pop depth="1"/>
+      </rule>
+    </state>
+
+    <state name="root">
+      <!-- Comments: '#' up to and including the end-of-line newline (rule order in this state is significant) -->
+      <rule pattern="#.*\n">
+        <token type="CommentSingle"/>
+      </rule>
+
+      <!-- Whitespace (space, newline, tab, vertical tab, form feed, carriage return) -->
+      <rule pattern="[ \n\t\v\f\r]+">
+        <token type="Text"/>
+      </rule>
+
+      <!-- Operators: a minus sign is emitted on its own; none of the number rules below accept a leading sign -->
+      <rule pattern="-">
+        <token type="Operator" />
+      </rule>
+
+      <!-- Qualified names (with dots) for field paths and type URLs. Must come before the inf/nan and boolean rules: their \b also matches in front of a dot, so "t.ext" or "inf.pkg.Ext" would otherwise be cut at the first dot -->
+      <rule pattern="[a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)+">
+        <token type="NameNamespace"/>
+      </rule>
+
+      <!-- Special float literals, matched case-insensitively; the trailing \b rejects longer identifiers such as "info" -->
+      <rule pattern="(?i)(?:inf|infinity)\b">
+        <token type="LiteralNumberFloat"/>
+      </rule>
+      <rule pattern="(?i)nan\b">
+        <token type="LiteralNumberFloat"/>
+      </rule>
+
+      <!-- Float literals with suffix (must come before decimal integers) -->
+      <rule pattern="(?:0|[1-9][0-9]*)[fF]">
+        <token type="LiteralNumberFloat"/>
+      </rule>
+
+      <!-- Float literals: leading dot (.5), digits with a dot (10. or 1.23e-10), digits with exponent (2e5); f/F suffix optional -->
+      <rule pattern="\.[0-9]+(?:[eE][+-]?[0-9]+)?[fF]?">
+        <token type="LiteralNumberFloat"/>
+      </rule>
+      <rule pattern="(?:0|[1-9][0-9]*)\.[0-9]*(?:[eE][+-]?[0-9]+)?[fF]?">
+        <token type="LiteralNumberFloat"/>
+      </rule>
+      <rule pattern="(?:0|[1-9][0-9]*)[eE][+-]?[0-9]+[fF]?">
+        <token type="LiteralNumberFloat"/>
+      </rule>
+
+      <!-- Hexadecimal integers (must come before decimal integers, which would take the leading 0) -->
+      <rule pattern="0[xX][0-9a-fA-F]+">
+        <token type="LiteralNumberHex"/>
+      </rule>
+
+      <!-- Octal integers (must come before decimal integers, which would take the leading 0) -->
+      <rule pattern="0[0-7]+">
+        <token type="LiteralNumberOct"/>
+      </rule>
+
+      <!-- Decimal integers: a lone 0, or a non-zero digit followed by any digits -->
+      <rule pattern="(?:0|[1-9][0-9]*)">
+        <token type="LiteralNumberInteger"/>
+      </rule>
+
+      <!-- Boolean keywords; the one-letter forms t and f count only as whole words -->
+      <rule pattern="\b(?:[Tt]rue|[Ff]alse|t|f)\b">
+        <token type="KeywordConstant"/>
+      </rule>
+
+      <!-- Strings: the opening quote pushes the matching string state, which pops on the closing quote -->
+      <rule pattern="&#34;">
+        <token type="LiteralStringDouble"/>
+        <push state="double-quote"/>
+      </rule>
+      <rule pattern="&#39;">
+        <token type="LiteralStringSingle"/>
+        <push state="single-quote"/>
+      </rule>
+
+      <!-- Field names and identifiers (including enum values) -->
+      <rule pattern="[a-zA-Z_][a-zA-Z0-9_]*">
+        <token type="Name"/>
+      </rule>
+
+      <!-- URL paths in type URLs for Any expansion (the part starting at the slash) -->
+      <rule pattern="/[a-zA-Z_][a-zA-Z0-9_/.]*">
+        <token type="NameNamespace"/>
+      </rule>
+
+      <!-- Punctuation: separators plus the bracket, brace and angle delimiters -->
+      <rule pattern="[:;,&lt;&gt;\[\]{}]">
+        <token type="Punctuation"/>
+      </rule>
+    </state>
+  </rules>
+</lexer>
diff --git a/lexers/testdata/txtpb.actual b/lexers/testdata/txtpb.actual
@@ -0,0 +1,135 @@
+# proto-file: example/test.proto
+# proto-message: TestMessage
+
+# Basic field assignments
+name: "John Smith"
+age: 42
+email: 'user@example.com'
+
+# Numeric literals
+decimal_value: 12345
+octal_value: 0755
+hex_value: 0xDEADBEEF
+zero: 0
+
+# Negative numbers (- is a separate token)
+negative_int: -42
+negative_float: -3.14159
+negative_with_space: - 100
+negative_scientific: -54.32E+1f
+
+# Float literals
+pi: 3.14159
+scientific: 1.23e-10
+with_suffix: 42f
+decimal_suffix: 100F
+exp_notation: 2e5
+leading_dot: .5
+trailing_dot: 10.
+optional_suffix: 1.0f
+
+# Special float values
+positive_inf: inf
+negative_inf: -infinity
+not_a_number: NaN
+case_insensitive: INFINITY
+
+# Boolean values
+is_active: true
+is_deleted: False
+short_true: t
+short_false: f
+# 0, 1, 0x0, 0x1, etc. are also valid but not bools lexically
+
+# String escapes
+escaped_string: "line1\nline2\ttab\r\nwindows\\backslash"
+double_quote_escape: "He said \"hello\", 'single' quote"
+single_quote_escape: 'She said \'good bye\', "double" quote'
+octal_escape: "\0\18\22\3334"
+hex_escape: "\x0\x1Hello\x42"
+unicode_escape: "\u0041\u0042"
+unicode_long: "\U0001f951\U0010FFFD"
+
+# Message values with colon delimiter
+nested_message: {
+  field1: "value1"
+  field2: 123
+}
+
+# Message values with angle brackets
+another_message <
+  inner_field: true
+>
+
+# Repeated fields
+tags: "tag1"
+tags: "tag2"
+tags: "tag3"
+
+# List syntax for repeated fields
+numbers: [1, 2, 3, 4, 5]
+strings: ["first", "second", "third"]
+
+# Map entries
+metadata { key: "version" value: "1.0" }
+metadata { key: "author" value: "test" }
+metadata: [
+  { key: "language" value: "proto" },
+  { key: "year" value: "2024" }
+]
+
+# Extension fields
+[com.example.custom_field]: "extension_value"
+[org.example.number_ext]: 999
+
+# Any expansion
+details {
+  [type.googleapis.com/example.PersonDetails] {
+    phone: "555-1234"
+    address: "123 Main St"
+  }
+}
+
+# Qualified field names
+config.server.host: "localhost"
+config.server.port: 8080
+
+# Group field (capitalized name)
+MyGroup {
+  my_value: 42
+}
+
+# Enum values by name
+status: ACTIVE
+priority: HIGH
+color: RED
+
+# Enum values by number
+fallback_status: 1
+
+# Edge cases
+empty_string: ""
+empty_message: {}
+empty_list: []
+
+# Multiple nested messages
+outer {
+  middle {
+    inner {
+      deep_field: "nested"
+    }
+  }
+}
+
+# Mixed delimiters
+mixed: { field1: "a" } next: < field2: "b" >
+
+# Semicolon separators (optional)
+with_semi: 1; another: 2;
+
+# Concatenated strings
+long_string: "This is a very long string that "
+             "can be split across multiple lines "
+             "for better readability"
+
+field_with_comment: 42  # Inline comment (and no new line, should be handled by ensure_nl=true)
diff --git a/lexers/testdata/txtpb.expected b/lexers/testdata/txtpb.expected