Skip to content

Commit a14f6ad

Browse files
authored
Improve crashtracking payload and add build_id and relative address (#10469)
Improve crashtracking payload and add build_id and relative address. Squashed commits: spotbugs; Add timeout to the build_id collecting thread; Change file to path; Update dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/parsers/HotspotCrashLogParser.java (Co-authored-by: Jaroslav Bachorik <jaroslav.bachorik@datadoghq.com>); Extract PDB guid and age as build_id for PE; Add documentation for the threading model; Move EMPTY inside BuildInfo; Log if queue offer fails; Merge branch 'master' into andrea.marziali/buildid; remove import; Merge branch 'master' into andrea.marziali/buildid; Merge branch 'master' into andrea.marziali/buildid. Co-authored-by: andrea.marziali <andrea.marziali@datadoghq.com>
1 parent 3600590 commit a14f6ad

23 files changed

Lines changed: 1326 additions & 80 deletions

dd-java-agent/agent-crashtracking/build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ dependencies {
1616
implementation project(':internal-api')
1717
implementation project(':products:metrics:metrics-lib')
1818
implementation project(':utils:container-utils')
19+
implementation project(':utils:queue-utils')
1920
implementation project(':utils:version-utils')
2021
implementation project(path: ':dd-java-agent:ddprof-lib', configuration: 'shadow')
2122

@@ -25,6 +26,7 @@ dependencies {
2526
testImplementation libs.bundles.junit5
2627
testImplementation libs.bundles.mockito
2728
testImplementation libs.jackson.databind
29+
testImplementation libs.testcontainers
2830
testImplementation group: 'com.squareup.okhttp3', name: 'mockwebserver', version: libs.versions.okhttp.legacy.get()
2931
}
3032

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
package datadog.crashtracking.buildid;
2+
3+
import static datadog.crashtracking.buildid.BuildInfo.EMPTY;
4+
import static java.util.concurrent.TimeUnit.MILLISECONDS;
5+
import static java.util.concurrent.TimeUnit.SECONDS;
6+
7+
import datadog.common.queue.Queues;
8+
import datadog.trace.util.AgentTaskScheduler;
9+
import java.nio.file.Path;
10+
import java.util.HashSet;
11+
import java.util.Map;
12+
import java.util.Set;
13+
import java.util.concurrent.ConcurrentHashMap;
14+
import java.util.concurrent.CountDownLatch;
15+
import java.util.concurrent.TimeUnit;
16+
import java.util.concurrent.atomic.AtomicBoolean;
17+
import java.util.concurrent.locks.LockSupport;
18+
import org.jctools.queues.MessagePassingQueue;
19+
import org.slf4j.Logger;
20+
import org.slf4j.LoggerFactory;
21+
22+
/**
23+
* Collects build IDs from library files asynchronously.
24+
*
25+
* <h2>Threading Model</h2>
26+
*
27+
* This class follows a single-producer, single-consumer (SPSC) threading model:
28+
*
29+
* <ul>
30+
* <li><b>Producer (single-threaded):</b> The crash parsing flow calls {@link
31+
* #resolveBuildId(Path)} to enqueue libraries for processing. This method is guaranteed to be
32+
* called from a single thread.
33+
* <li><b>Consumer (single-threaded):</b> The background {@link Collector} thread processes the
34+
* work queue. Only one collector instance is ever started, enforced by {@code
35+
* collecting.compareAndSet(false, true)}.
36+
* </ul>
37+
*
38+
* <h2>Synchronization Strategy</h2>
39+
*
40+
* <ul>
41+
* <li><b>workQueue:</b> An SPSC (Single Producer Single Consumer) queue - thread-safe for one
42+
* producer and one consumer without additional synchronization.
43+
* <li><b>processed:</b> A plain {@link HashSet} - safe because it's only accessed from the
44+
* producer thread (crash parsing flow).
45+
* <li><b>libraryBuildInfo:</b> A {@link ConcurrentHashMap} - accessed from both producer
46+
* (removal) and consumer (insertion) threads, requires concurrent access.
47+
* <li><b>collecting:</b> An {@link AtomicBoolean} - coordinates lifecycle and ensures exactly one
48+
* collector is started.
49+
* </ul>
50+
*/
51+
public class BuildIdCollector {
52+
static final Logger LOGGER = LoggerFactory.getLogger(BuildIdCollector.class);
53+
54+
/** Thread-safe map: accessed by both producer and consumer threads. */
55+
private final Map<String, BuildInfo> libraryBuildInfo = new ConcurrentHashMap<>();
56+
57+
/** Tracks processed filenames. Only accessed from producer thread - no synchronization needed. */
58+
private final Set<String> processed = new HashSet<>();
59+
60+
/** Ensures exactly one collector thread is started. */
61+
private final AtomicBoolean collecting = new AtomicBoolean(false);
62+
63+
/** SPSC queue: one producer (crash parsing), one consumer (collector thread). */
64+
private final MessagePassingQueue<Path> workQueue = Queues.spscArrayQueue(Short.MAX_VALUE);
65+
66+
/** Signals when collection is complete. */
67+
private final CountDownLatch latch = new CountDownLatch(1);
68+
69+
/**
70+
* Consumer thread that processes the work queue and extracts build IDs.
71+
*
72+
* <p><b>Threading:</b> Runs in a single background thread. Only one instance is ever created,
73+
* guaranteed by the {@code collecting.compareAndSet(false, true)} check in {@link
74+
* #resolveBuildId(Path)}.
75+
*
76+
* <p>Polls the {@code workQueue} until either:
77+
*
78+
* <ul>
79+
* <li>The deadline is reached, or
80+
* <li>The {@code collecting} flag is set to false (via {@link #awaitCollectionDone(int)}) and
81+
* the queue is empty
82+
* </ul>
83+
*/
84+
class Collector implements Runnable {
85+
private final BuildIdExtractor extractor = BuildIdExtractor.create();
86+
private final long deadline;
87+
88+
Collector(long timeout, TimeUnit unit) {
89+
this.deadline = unit.toNanos(timeout) + System.nanoTime();
90+
}
91+
92+
@Override
93+
public void run() {
94+
while (System.nanoTime() <= deadline) {
95+
final Path path = workQueue.poll();
96+
if (path == null) {
97+
if (!collecting.get()) {
98+
break;
99+
}
100+
LockSupport.parkNanos(MILLISECONDS.toNanos(50));
101+
continue;
102+
}
103+
final String fileName = path.getFileName().toString();
104+
LOGGER.debug("Resolving build id for {} against {}", fileName, path);
105+
final String buildId = extractor.extractBuildId(path);
106+
if (buildId != null) {
107+
LOGGER.debug("Found build id {} for library {}", buildId, fileName);
108+
libraryBuildInfo.put(
109+
fileName, new BuildInfo(buildId, extractor.buildIdType(), extractor.fileType()));
110+
}
111+
}
112+
latch.countDown();
113+
}
114+
}
115+
116+
/**
117+
* Registers a library filename as needing build ID resolution.
118+
*
119+
* <p>Called from producer thread (crash parsing flow) before collection starts.
120+
*
121+
* @param filename the library filename to track
122+
*/
123+
public void addUnprocessedLibrary(String filename) {
124+
if (!collecting.get()) {
125+
libraryBuildInfo.putIfAbsent(filename, EMPTY);
126+
}
127+
}
128+
129+
/**
130+
* Enqueues a library path for build ID extraction.
131+
*
132+
* <p><b>Threading:</b> This method is called exclusively from the producer thread (crash parsing
133+
* flow). It starts the collector thread on first invocation and enqueues work items.
134+
*
135+
* <p>The {@code processed} set is only accessed here (producer thread), so no synchronization is
136+
* needed for it.
137+
*
138+
* @param path the path to the library file
139+
*/
140+
public void resolveBuildId(Path path) {
141+
if (collecting.compareAndSet(false, true)) {
142+
AgentTaskScheduler.get().execute(new Collector(5, SECONDS));
143+
}
144+
final String filename = path.getFileName().toString();
145+
if (!processed.add(filename)) {
146+
return;
147+
}
148+
if (libraryBuildInfo.remove(filename) == null) {
149+
// the library is not present in the collected ones part of the stackframe
150+
LOGGER.debug(
151+
"Skipping build id resolution for {} as it was not added to unprocessed", filename);
152+
153+
} else if (!workQueue.offer(path)) {
154+
LOGGER.warn(
155+
"Could not resolve the build id for library {} because the processing queue is full",
156+
path);
157+
}
158+
}
159+
160+
/**
161+
* Signals that no more work will be enqueued and waits for collection to complete.
162+
*
163+
* <p>Called from producer thread to stop collection and wait for the collector to finish
164+
* processing the queue.
165+
*
166+
* @param timeoutSeconds maximum time to wait for collection to complete
167+
*/
168+
public void awaitCollectionDone(final int timeoutSeconds) {
169+
if (!collecting.compareAndSet(true, false)) {
170+
return;
171+
}
172+
try {
173+
if (!latch.await(timeoutSeconds, SECONDS)) {
174+
LOGGER.warn("Build id collection incomplete.");
175+
}
176+
} catch (InterruptedException ie) {
177+
Thread.currentThread().interrupt();
178+
LOGGER.warn("Interrupted while waiting for build id collection to finish");
179+
}
180+
}
181+
182+
/**
183+
* Retrieves the build information for a library.
184+
*
185+
* <p>This method can be called from any thread after collection is complete. The {@link
186+
* ConcurrentHashMap} ensures thread-safe reads.
187+
*
188+
* @param filename the library filename
189+
* @return the build information, or null if not found
190+
*/
191+
public BuildInfo getBuildInfo(String filename) {
192+
return libraryBuildInfo.get(filename);
193+
}
194+
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package datadog.crashtracking.buildid;
2+
3+
import datadog.environment.OperatingSystem;
4+
import java.nio.file.Path;
5+
6+
/**
7+
* Interface for extracting build IDs from native library binaries. Build IDs help identify exact
8+
* library versions for symbolization of native stack traces.
9+
*/
10+
public interface BuildIdExtractor {
11+
/**
12+
* Extracts build ID from a binary file.
13+
*
14+
* @param file Path to the library file
15+
* @return Build ID as hex string, or null if not found or on error
16+
*/
17+
String extractBuildId(Path file);
18+
19+
/**
20+
* @return the file type this extractor operates for.
21+
*/
22+
BuildInfo.FileType fileType();
23+
24+
/**
25+
* @return the build id type this extractor is able to provide.
26+
*/
27+
BuildInfo.BuildIdType buildIdType();
28+
29+
/**
30+
* Factory method that returns appropriate extractor for the platform.
31+
*
32+
* @return Platform-specific build ID extractor
33+
*/
34+
static BuildIdExtractor create() {
35+
if (OperatingSystem.isLinux()) {
36+
return new ElfBuildIdExtractor();
37+
} else if (OperatingSystem.isWindows()) {
38+
return new PeBuildIdExtractor();
39+
}
40+
return new NoOpBuildIdExtractor();
41+
}
42+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package datadog.crashtracking.buildid;
2+
3+
/** Immutable holder for the build id of a native library and the kinds of id and file involved. */
public class BuildInfo {
  /** Flavor of the build identifier. */
  public enum BuildIdType {
    /** Build id found in ELF files. */
    GNU,
    /** Build id found in DLL PE files. */
    PDB
  }

  /** Binary container format of the library. */
  public enum FileType {
    ELF,
    PE,
  }

  /** Placeholder instance whose three fields are all {@code null}. */
  static final BuildInfo EMPTY = new BuildInfo(null, null, null);

  /** The build id value; may be {@code null} (see {@link #EMPTY}). */
  public final String buildId;

  /** The kind of {@link #buildId}; may be {@code null} (see {@link #EMPTY}). */
  public final BuildIdType buildIdType;

  /** The format of the file the id was read from; may be {@code null} (see {@link #EMPTY}). */
  public final FileType fileType;

  /**
   * Stores the three values as given, without validation.
   *
   * @param buildId the build id value
   * @param buildIdType the kind of build id
   * @param fileType the format of the originating file
   */
  public BuildInfo(String buildId, BuildIdType buildIdType, FileType fileType) {
    this.fileType = fileType;
    this.buildIdType = buildIdType;
    this.buildId = buildId;
  }
}

0 commit comments

Comments
 (0)