Skip to content

Commit 3fdec93

Browse files
fmeumcopybara-github
authored andcommitted
Fix more Unicode bugs
* Use Latin-1 in many native file write rules for consistency with the internal encoding. * Use Latin-1 for the resolved repository file and the JSON profile. * Fix `unused_input_list` handling of non-ASCII characters in file names. * Flip the `legacy_utf8` parameter of `repository_ctx.file` to `False` and make it a no-op. With the previous default, any non-ASCII characters would be written out as double encoded UTF-8, which is not a useful choice. * Change `repository_ctx.template` to operate on raw bytes for consistency with `repository_ctx.read` and to fix substitution with non-ASCII keys/values. * Move some usages of `UTF_8` closer to their usage site to clarify why they are correct. * Fixes parsing of dependency files with Unicode character contents (`/showIncludes` and `.d` files) Closes #24182. PiperOrigin-RevId: 698111811 Change-Id: Ie43bab9eb5963bf81690dd8985d358f544a711c9
1 parent c6148b5 commit 3fdec93

43 files changed

Lines changed: 351 additions & 334 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

src/main/java/com/google/devtools/build/lib/analysis/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1482,6 +1482,7 @@ java_library(
14821482
"//src/main/java/com/google/devtools/build/lib/actions:artifacts",
14831483
"//src/main/java/com/google/devtools/build/lib/actions:commandline_item",
14841484
"//src/main/java/com/google/devtools/build/lib/collect/nestedset",
1485+
"//src/main/java/com/google/devtools/build/lib/unsafe:string",
14851486
"//src/main/java/com/google/devtools/build/lib/util",
14861487
"//src/main/java/net/starlark/java/eval",
14871488
"//third_party:jsr305",
@@ -1498,6 +1499,7 @@ java_library(
14981499
"//src/main/java/com/google/devtools/build/lib/actions:artifact_expander",
14991500
"//src/main/java/com/google/devtools/build/lib/actions:artifacts",
15001501
"//src/main/java/com/google/devtools/build/lib/collect/nestedset",
1502+
"//src/main/java/com/google/devtools/build/lib/unsafe:string",
15011503
"//src/main/java/com/google/devtools/build/lib/util",
15021504
"//third_party:guava",
15031505
"//third_party:jsr305",

src/main/java/com/google/devtools/build/lib/analysis/ConfiguredRuleClassProvider.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
import com.google.devtools.build.lib.packages.RuleTransitionData;
5454
import com.google.devtools.build.lib.packages.WorkspaceFactory;
5555
import com.google.devtools.build.lib.starlarkbuildapi.core.Bootstrap;
56+
import com.google.devtools.build.lib.unsafe.StringUnsafe;
5657
import com.google.devtools.build.lib.vfs.DigestHashFunction;
5758
import com.google.devtools.build.lib.vfs.Path;
5859
import com.google.devtools.build.lib.vfs.PathFragment;
@@ -131,7 +132,10 @@ protected synchronized byte[] getFastDigest(PathFragment path) {
131132

132133
@Override
133134
protected synchronized byte[] getDigest(PathFragment path) {
134-
return getDigestFunction().getHashFunction().hashString(path.toString(), UTF_8).asBytes();
135+
return getDigestFunction()
136+
.getHashFunction()
137+
.hashBytes(StringUnsafe.getInstance().getInternalStringBytes(path.getPathString()))
138+
.asBytes();
135139
}
136140
}
137141

src/main/java/com/google/devtools/build/lib/analysis/RepoMappingManifestAction.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import static com.google.common.collect.ImmutableList.toImmutableList;
1717
import static com.google.common.collect.ImmutableSortedMap.toImmutableSortedMap;
1818
import static java.nio.charset.StandardCharsets.ISO_8859_1;
19-
import static java.nio.charset.StandardCharsets.UTF_8;
2019
import static java.util.Comparator.comparing;
2120

2221
import com.github.benmanes.caffeine.cache.Caffeine;
@@ -146,7 +145,7 @@ protected void computeKey(
146145
public String getFileContents(@Nullable EventHandler eventHandler) throws IOException {
147146
ByteArrayOutputStream stream = new ByteArrayOutputStream();
148147
newDeterministicWriter().writeOutputFile(stream);
149-
return stream.toString(UTF_8);
148+
return stream.toString(ISO_8859_1);
150149
}
151150

152151
@Override

src/main/java/com/google/devtools/build/lib/analysis/SourceManifestAction.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515

1616
import static com.google.common.collect.ImmutableList.toImmutableList;
1717
import static java.nio.charset.StandardCharsets.ISO_8859_1;
18-
import static java.nio.charset.StandardCharsets.UTF_8;
1918

2019
import com.google.common.annotations.VisibleForTesting;
2120
import com.google.common.collect.ImmutableList;
@@ -218,7 +217,7 @@ public void writeOutputFile(OutputStream out, @Nullable EventHandler eventHandle
218217
public String getFileContents(@Nullable EventHandler eventHandler) throws IOException {
219218
ByteArrayOutputStream stream = new ByteArrayOutputStream();
220219
writeOutputFile(stream, eventHandler);
221-
return stream.toString(UTF_8);
220+
return stream.toString(ISO_8859_1);
222221
}
223222

224223
@Override

src/main/java/com/google/devtools/build/lib/analysis/actions/LazyWriteNestedSetOfTupleAction.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
package com.google.devtools.build.lib.analysis.actions;
1616

17-
import static java.nio.charset.StandardCharsets.UTF_8;
1817

1918
import com.google.devtools.build.lib.actions.ActionExecutionContext;
2019
import com.google.devtools.build.lib.actions.ActionKeyContext;
@@ -25,6 +24,7 @@
2524
import com.google.devtools.build.lib.collect.nestedset.NestedSet;
2625
import com.google.devtools.build.lib.collect.nestedset.NestedSetBuilder;
2726
import com.google.devtools.build.lib.collect.nestedset.Order;
27+
import com.google.devtools.build.lib.unsafe.StringUnsafe;
2828
import com.google.devtools.build.lib.util.Fingerprint;
2929
import javax.annotation.Nullable;
3030
import net.starlark.java.eval.Tuple;
@@ -49,7 +49,8 @@ public LazyWriteNestedSetOfTupleAction(
4949

5050
@Override
5151
public DeterministicWriter newDeterministicWriter(ActionExecutionContext ctx) {
52-
return out -> out.write(getContents(delimiter).getBytes(UTF_8));
52+
return out ->
53+
out.write(StringUnsafe.getInstance().getInternalStringBytes(getContents(delimiter)));
5354
}
5455

5556
/** Computes the Action key for this action by computing the fingerprint for the file contents. */

src/main/java/com/google/devtools/build/lib/analysis/actions/LazyWritePathsFileAction.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515

1616
package com.google.devtools.build.lib.analysis.actions;
1717

18-
import static java.nio.charset.StandardCharsets.UTF_8;
1918

2019
import com.google.common.collect.ImmutableSet;
2120
import com.google.devtools.build.lib.actions.ActionExecutionContext;
@@ -26,6 +25,7 @@
2625
import com.google.devtools.build.lib.collect.nestedset.NestedSet;
2726
import com.google.devtools.build.lib.collect.nestedset.NestedSetBuilder;
2827
import com.google.devtools.build.lib.collect.nestedset.Order;
28+
import com.google.devtools.build.lib.unsafe.StringUnsafe;
2929
import com.google.devtools.build.lib.util.Fingerprint;
3030
import java.util.function.Function;
3131
import javax.annotation.Nullable;
@@ -71,7 +71,7 @@ public LazyWritePathsFileAction(
7171

7272
@Override
7373
public DeterministicWriter newDeterministicWriter(ActionExecutionContext ctx) {
74-
return out -> out.write(getContents().getBytes(UTF_8));
74+
return out -> out.write(StringUnsafe.getInstance().getInternalStringBytes(getContents()));
7575
}
7676

7777
/** Computes the Action key for this action by computing the fingerprint for the file contents. */

src/main/java/com/google/devtools/build/lib/analysis/actions/LocalTemplateExpansionStrategy.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
package com.google.devtools.build.lib.analysis.actions;
1616

17-
import static java.nio.charset.StandardCharsets.UTF_8;
17+
import static java.nio.charset.StandardCharsets.ISO_8859_1;
1818

1919
import com.google.common.collect.ImmutableList;
2020
import com.google.devtools.build.lib.actions.AbstractAction;
@@ -47,7 +47,8 @@ public ImmutableList<SpawnResult> expandTemplate(
4747
final String expandedTemplate =
4848
getExpandedTemplateUnsafe(
4949
templateMetadata.template(), templateMetadata.substitutions(), ctx.getPathResolver());
50-
DeterministicWriter deterministicWriter = out -> out.write(expandedTemplate.getBytes(UTF_8));
50+
DeterministicWriter deterministicWriter =
51+
out -> out.write(expandedTemplate.getBytes(ISO_8859_1));
5152
return ctx.getContext(FileWriteActionContext.class)
5253
.writeOutputToFile(
5354
action,

src/main/java/com/google/devtools/build/lib/analysis/actions/StarlarkAction.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
// limitations under the License.
1414
package com.google.devtools.build.lib.analysis.actions;
1515

16-
import static java.nio.charset.StandardCharsets.UTF_8;
16+
import static java.nio.charset.StandardCharsets.ISO_8859_1;
1717

1818
import com.google.common.annotations.VisibleForTesting;
1919
import com.google.common.collect.ImmutableMap;
@@ -349,10 +349,13 @@ protected void afterExecute(
349349
for (Artifact input : allInputs.toList()) {
350350
usedInputsByMappedPath.put(pathMapper.getMappedExecPathString(input), input);
351351
}
352+
// Bazel encodes file system paths as raw bytes stored in a Latin-1 encoded string, so we need
353+
// to make sure to also decode the unused input list as Latin-1.
352354
try (BufferedReader br =
353355
new BufferedReader(
354356
new InputStreamReader(
355-
getUnusedInputListInputStream(actionExecutionContext, spawnResults), UTF_8))) {
357+
getUnusedInputListInputStream(actionExecutionContext, spawnResults),
358+
ISO_8859_1))) {
356359
String line;
357360
while ((line = br.readLine()) != null) {
358361
line = line.trim();

src/main/java/com/google/devtools/build/lib/analysis/actions/Template.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
package com.google.devtools.build.lib.analysis.actions;
1616

17+
import static java.nio.charset.StandardCharsets.ISO_8859_1;
18+
1719
import com.google.common.annotations.VisibleForTesting;
1820
import com.google.devtools.build.lib.actions.Artifact;
1921
import com.google.devtools.build.lib.actions.ArtifactPathResolver;
@@ -22,16 +24,12 @@
2224
import com.google.devtools.build.lib.vfs.FileSystemUtils;
2325
import com.google.devtools.build.lib.vfs.Path;
2426
import java.io.IOException;
25-
import java.nio.charset.Charset;
26-
import java.nio.charset.StandardCharsets;
2727
import javax.annotation.Nullable;
2828

2929
/** A template that contains text content, or alternatively throws an {@link IOException}. */
3030
@Immutable // all subclasses are immutable
3131
public abstract class Template {
3232

33-
static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;
34-
3533
/** We only allow subclasses in this file. */
3634
private Template() {}
3735

@@ -105,7 +103,8 @@ private static final class ArtifactTemplate extends Template {
105103
public String getContent(ArtifactPathResolver resolver) throws IOException {
106104
Path templatePath = resolver.toPath(templateArtifact);
107105
try {
108-
return FileSystemUtils.readContent(templatePath, DEFAULT_CHARSET);
106+
// Bazel's internal encoding for strings is raw bytes as Latin-1
107+
return FileSystemUtils.readContent(templatePath, ISO_8859_1);
109108
} catch (IOException e) {
110109
throw new IOException(
111110
"failed to load template file '"

src/main/java/com/google/devtools/build/lib/analysis/starlark/StarlarkActionFactory.java

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -977,13 +977,8 @@ public void expandTemplate(
977977
ImmutableMap.Builder<String, Substitution> substitutionsBuilder = ImmutableMap.builder();
978978
for (Map.Entry<String, String> substitution :
979979
Dict.cast(substitutionsUnchecked, String.class, String.class, "substitutions").entrySet()) {
980-
// Blaze calls ParserInput.fromLatin1 when reading BUILD files, which might
981-
// contain UTF-8 encoded symbols as part of template substitution.
982-
// As a quick fix, the substitution values are corrected before being passed on.
983-
// In the long term, avoiding ParserInput.fromLatin would be a better approach.
984980
substitutionsBuilder.put(
985-
substitution.getKey(),
986-
Substitution.of(substitution.getKey(), convertLatin1ToUtf8(substitution.getValue())));
981+
substitution.getKey(), Substitution.of(substitution.getKey(), substitution.getValue()));
987982
}
988983
if (!Starlark.UNBOUND.equals(computedSubstitutions)) {
989984
for (Substitution substitution : ((TemplateDict) computedSubstitutions).getAll()) {
@@ -1007,16 +1002,6 @@ public void expandTemplate(
10071002
registerAction(action);
10081003
}
10091004

1010-
/**
1011-
* Returns the proper UTF-8 representation of a String that was erroneously read using Latin1.
1012-
*
1013-
* @param latin1 Input string
1014-
* @return The input string, UTF8 encoded
1015-
*/
1016-
private static String convertLatin1ToUtf8(String latin1) {
1017-
return new String(latin1.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
1018-
}
1019-
10201005
@Override
10211006
public Args args(StarlarkThread thread) {
10221007
return Args.newArgs(thread.mutability(), getSemantics());

0 commit comments

Comments
 (0)