Skip to content

Commit 9fb399a

Browse files
committed
refactor(analyzer): replace pre-scan with interval-based noise floor estimation
- Remove preScanNoiseFloor() function (~130 lines) that redundantly read 15% of file before Pass 1
- Add estimateNoiseFloorAndThreshold() to compute noise floor from first 15% of interval samples collected during Pass 1
- Refactor calculateSilenceThresholdFromIntervals() as wrapper for new function
- Rename constants: preScan* → silence* to reflect new implementation
- Update log message from "pre-scan" to "noise floor estimate"

The interval data already contained all necessary RMS measurements; the separate file read was wasted I/O. All measurements remain identical.
1 parent 47478b4 commit 9fb399a

2 files changed

Lines changed: 56 additions & 171 deletions

File tree

internal/logging/report.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1684,7 +1684,7 @@ func writeDiagnosticSilence(f *os.File, measurements *processor.AudioMeasurement
16841684

16851685
// Show adaptive silence detection threshold if different from default
16861686
if measurements.SilenceDetectLevel != 0 && measurements.SilenceDetectLevel != -50.0 {
1687-
fmt.Fprintf(f, "Silence Threshold: %.1f dB (adaptive from %.1f dB pre-scan)\n",
1687+
fmt.Fprintf(f, "Silence Threshold: %.1f dB (from %.1f dB noise floor estimate)\n",
16881688
measurements.SilenceDetectLevel, measurements.PreScanNoiseFloor)
16891689
}
16901690

internal/processor/analyzer.go

Lines changed: 55 additions & 170 deletions
Original file line numberDiff line numberDiff line change
@@ -670,18 +670,19 @@ func roomToneScore(interval IntervalSample, rmsP50, fluxP50 float64) float64 {
670670
return roomToneAmplitudeWeight*amplitudeScore + roomToneFluxWeight*fluxScore
671671
}
672672

673-
// calculateSilenceThresholdFromIntervals derives the silence threshold from interval data.
674-
// Uses spectral analysis to identify room tone by its characteristic stability and quietness.
673+
// estimateNoiseFloorAndThreshold analyses interval data to estimate noise floor and silence threshold.
674+
// Returns (noiseFloor, silenceThreshold, ok). If ok is false, fallback values should be used.
675675
//
676-
// Key insight: room tone detection should use behavioral characteristics, not just amplitude:
676+
// Uses spectral analysis to identify room tone by its characteristic stability and quietness:
677677
// 1. Room tone is quieter than speech (but may overlap with quiet speech)
678678
// 2. Room tone has low spectral flux (stable, unchanging)
679679
// 3. Room tone has consistent spectral characteristics
680680
//
681-
// We compute a "room tone score" for each interval and use that to find the threshold.
682-
func calculateSilenceThresholdFromIntervals(intervals []IntervalSample, fallbackThreshold float64) float64 {
681+
// The noise floor is the max RMS of high-confidence room tone intervals.
682+
// The silence threshold adds headroom to the noise floor for detection margin.
683+
func estimateNoiseFloorAndThreshold(intervals []IntervalSample) (noiseFloor, silenceThreshold float64, ok bool) {
683684
if len(intervals) < silenceThresholdMinIntervals {
684-
return fallbackThreshold
685+
return 0, 0, false
685686
}
686687

687688
// Only use the first silenceSearchPercent% of intervals for threshold calculation
@@ -734,16 +735,26 @@ func calculateSilenceThresholdFromIntervals(intervals []IntervalSample, fallback
734735
candidateCount = len(scored)
735736
}
736737

737-
// Threshold is the maximum RMS among high-confidence room tone intervals
738-
// Add small headroom to catch edge cases
738+
// Noise floor is the maximum RMS among high-confidence room tone intervals
739739
maxRoomToneRMS := -120.0
740740
for i := 0; i < candidateCount; i++ {
741741
if scored[i].rms > maxRoomToneRMS {
742742
maxRoomToneRMS = scored[i].rms
743743
}
744744
}
745745

746-
return maxRoomToneRMS + silenceThresholdHeadroomDB
746+
return maxRoomToneRMS, maxRoomToneRMS + silenceThresholdHeadroomDB, true
747+
}
748+
749+
// calculateSilenceThresholdFromIntervals derives the silence threshold from interval data.
750+
// This is a convenience wrapper around estimateNoiseFloorAndThreshold for callers that only
751+
// need the threshold value.
752+
func calculateSilenceThresholdFromIntervals(intervals []IntervalSample, fallbackThreshold float64) float64 {
753+
_, threshold, ok := estimateNoiseFloorAndThreshold(intervals)
754+
if !ok {
755+
return fallbackThreshold
756+
}
757+
return threshold
747758
}
748759

749760
// findSilenceCandidatesFromIntervals identifies silence regions from interval samples.
@@ -1443,8 +1454,8 @@ type AudioMeasurements struct {
14431454
TargetOffset float64 `json:"target_offset"` // Offset for normalization
14441455
NoiseFloor float64 `json:"noise_floor"` // Measured noise floor from astats (dBFS)
14451456

1446-
// Pre-scan adaptive silence detection thresholds
1447-
PreScanNoiseFloor float64 `json:"prescan_noise_floor"` // Noise floor estimated from first 15% of audio (dBFS)
1457+
// Adaptive silence detection thresholds (derived from interval sampling)
1458+
PreScanNoiseFloor float64 `json:"prescan_noise_floor"` // Noise floor estimated from first 15% of intervals (dBFS)
14481459
SilenceDetectLevel float64 `json:"silence_detect_level"` // Adaptive silencedetect threshold used (dBFS)
14491460

14501461
// Silence detection results (derived from interval sampling)
@@ -1725,165 +1736,36 @@ func finalizeOutputMeasurements(acc *outputMetadataAccumulators) *OutputMeasurem
17251736
return m
17261737
}
17271738

1728-
// Pre-scan constants
1739+
// Threshold bounds for adaptive silence detection
17291740
const (
1730-
// preScanPercent is the fraction of audio to scan for adaptive threshold detection.
1731-
// We scan the first 15% of the recording to estimate the noise floor before running
1732-
// the full Pass 1 analysis. This allows silencedetect to use an adaptive threshold.
1733-
preScanPercent = 0.15
1734-
1735-
// preScanSilenceHeadroom is added to the pre-scan noise floor to get the silencedetect threshold.
1741+
// silenceFallbackHeadroom is added to the noise floor to get the silencedetect threshold.
17361742
// A region is considered "silence" if it's within this headroom of the noise floor.
17371743
// Higher values detect more silence (including quieter room tone) but may include crosstalk.
1738-
preScanSilenceHeadroom = 6.0 // dB
1744+
silenceFallbackHeadroom = 6.0 // dB
17391745

1740-
// preScanMinThreshold prevents silencedetect from being too sensitive in very quiet recordings.
1746+
// silenceMinThreshold prevents silencedetect from being too sensitive in very quiet recordings.
17411747
// Even professional recordings rarely have silence below -70 dBFS.
1742-
preScanMinThreshold = -70.0
1748+
silenceMinThreshold = -70.0
17431749

1744-
// preScanMaxThreshold prevents silencedetect from detecting loud sections as silence.
1750+
// silenceMaxThreshold prevents silencedetect from detecting loud sections as silence.
17451751
// If the estimated threshold is above this, something is wrong with the recording.
1746-
preScanMaxThreshold = -35.0
1752+
silenceMaxThreshold = -35.0
17471753
)
17481754

1749-
// preScanNoiseFloor performs a quick scan of the first 15% of the audio to estimate
1750-
// the noise floor. This is used to set an adaptive silencedetect threshold for Pass 1.
1751-
//
1752-
// The function runs astats on the initial portion of audio and returns the RMSTrough
1753-
// measurement, which represents the level of the quietest segments (likely ambient noise).
1754-
//
1755-
// Returns the estimated noise floor in dBFS, or an error if the scan fails.
1756-
// If RMSTrough is not available, falls back to RMSLevel - 15dB.
1757-
func preScanNoiseFloor(filename string) (float64, error) {
1758-
// Open audio file
1759-
reader, metadata, err := audio.OpenAudioFile(filename)
1760-
if err != nil {
1761-
return 0, fmt.Errorf("pre-scan: failed to open audio file: %w", err)
1762-
}
1763-
defer reader.Close()
1764-
1765-
// Calculate how many frames to process (15% of audio)
1766-
totalDuration := metadata.Duration
1767-
scanDuration := totalDuration * preScanPercent
1768-
sampleRate := float64(metadata.SampleRate)
1769-
samplesPerFrame := 4096.0
1770-
maxFrames := int((scanDuration * sampleRate) / samplesPerFrame)
1771-
1772-
// Create a simple filter graph with just downmix and astats
1773-
// We need mono for consistent measurements, and astats for noise floor detection
1774-
filterSpec := "aformat=channel_layouts=mono,astats=metadata=1:measure_overall=Noise_floor+RMS_level+RMS_trough+Peak_level:measure_perchannel=0"
1775-
1776-
filterGraph, bufferSrcCtx, bufferSinkCtx, err := setupFilterGraph(
1777-
reader.GetDecoderContext(),
1778-
filterSpec,
1779-
)
1780-
if err != nil {
1781-
return 0, fmt.Errorf("pre-scan: failed to create filter graph: %w", err)
1782-
}
1783-
defer ffmpeg.AVFilterGraphFree(&filterGraph)
1784-
1785-
filteredFrame := ffmpeg.AVFrameAlloc()
1786-
defer ffmpeg.AVFrameFree(&filteredFrame)
1787-
1788-
// Track measurements
1789-
var rmsTrough, rmsLevel float64
1790-
frameCount := 0
1791-
1792-
// Process frames until we've scanned 15% of the audio
1793-
for frameCount < maxFrames {
1794-
frame, err := reader.ReadFrame()
1795-
if err != nil {
1796-
return 0, fmt.Errorf("pre-scan: failed to read frame: %w", err)
1797-
}
1798-
if frame == nil {
1799-
break // EOF
1800-
}
1801-
1802-
// Push frame into filter graph
1803-
if _, err := ffmpeg.AVBuffersrcAddFrameFlags(bufferSrcCtx, frame, 0); err != nil {
1804-
return 0, fmt.Errorf("pre-scan: failed to add frame to filter: %w", err)
1805-
}
1806-
frameCount++
1807-
1808-
// Pull filtered frames and extract astats metadata
1809-
for {
1810-
if _, err := ffmpeg.AVBuffersinkGetFrame(bufferSinkCtx, filteredFrame); err != nil {
1811-
if errors.Is(err, ffmpeg.EAgain) || errors.Is(err, ffmpeg.AVErrorEOF) {
1812-
break
1813-
}
1814-
return 0, fmt.Errorf("pre-scan: failed to get filtered frame: %w", err)
1815-
}
1816-
1817-
// Extract astats measurements from metadata
1818-
if metadata := filteredFrame.Metadata(); metadata != nil {
1819-
if value, ok := getFloatMetadata(metadata, metaKeyRMSTrough); ok {
1820-
rmsTrough = value
1821-
}
1822-
if value, ok := getFloatMetadata(metadata, metaKeyRMSLevel); ok {
1823-
rmsLevel = value
1824-
}
1825-
}
1826-
1827-
ffmpeg.AVFrameUnref(filteredFrame)
1828-
}
1829-
}
1830-
1831-
// Flush the filter graph
1832-
if _, err := ffmpeg.AVBuffersrcAddFrameFlags(bufferSrcCtx, nil, 0); err != nil {
1833-
return 0, fmt.Errorf("pre-scan: failed to flush filter: %w", err)
1834-
}
1835-
1836-
// Pull remaining frames
1837-
for {
1838-
if _, err := ffmpeg.AVBuffersinkGetFrame(bufferSinkCtx, filteredFrame); err != nil {
1839-
if errors.Is(err, ffmpeg.EAgain) || errors.Is(err, ffmpeg.AVErrorEOF) {
1840-
break
1841-
}
1842-
return 0, fmt.Errorf("pre-scan: failed to get remaining frame: %w", err)
1843-
}
1844-
1845-
if metadata := filteredFrame.Metadata(); metadata != nil {
1846-
if value, ok := getFloatMetadata(metadata, metaKeyRMSTrough); ok {
1847-
rmsTrough = value
1848-
}
1849-
if value, ok := getFloatMetadata(metadata, metaKeyRMSLevel); ok {
1850-
rmsLevel = value
1851-
}
1852-
}
1853-
1854-
ffmpeg.AVFrameUnref(filteredFrame)
1855-
}
1856-
1857-
// Determine noise floor from measurements
1858-
var noiseFloor float64
1859-
if rmsTrough != 0 && !math.IsInf(rmsTrough, -1) {
1860-
// Primary: use RMSTrough (quietest segments)
1861-
noiseFloor = rmsTrough
1862-
} else if rmsLevel != 0 && !math.IsInf(rmsLevel, -1) {
1863-
// Fallback: estimate from overall RMS level
1864-
// Quiet segments are typically 12-18dB below average RMS
1865-
noiseFloor = rmsLevel - 15.0
1866-
} else {
1867-
// No measurements available, use conservative default
1868-
return -50.0, nil // Default silencedetect threshold
1869-
}
1870-
1871-
return noiseFloor, nil
1872-
}
1873-
1874-
// calculateAdaptiveSilenceThreshold computes the silencedetect threshold from the pre-scan noise floor.
1755+
// calculateAdaptiveSilenceThreshold computes a bounded silence threshold from a noise floor estimate.
18751756
// Returns a threshold that's slightly above the noise floor to detect quiet room tone as silence.
1757+
// This is used as a fallback when interval-based estimation has insufficient data.
18761758
func calculateAdaptiveSilenceThreshold(noiseFloor float64) float64 {
18771759
// Silence threshold = noise floor + headroom
18781760
// This allows silencedetect to find regions that are at or slightly above the ambient noise
1879-
threshold := noiseFloor + preScanSilenceHeadroom
1761+
threshold := noiseFloor + silenceFallbackHeadroom
18801762

18811763
// Apply bounds to prevent extreme values
1882-
if threshold < preScanMinThreshold {
1883-
threshold = preScanMinThreshold
1764+
if threshold < silenceMinThreshold {
1765+
threshold = silenceMinThreshold
18841766
}
1885-
if threshold > preScanMaxThreshold {
1886-
threshold = preScanMaxThreshold
1767+
if threshold > silenceMaxThreshold {
1768+
threshold = silenceMaxThreshold
18871769
}
18881770

18891771
return threshold
@@ -1894,15 +1776,12 @@ func calculateAdaptiveSilenceThreshold(noiseFloor float64) float64 {
18941776
//
18951777
// Implementation note: ebur128 and astats write measurements to frame metadata with lavfi.r128.*
18961778
// and lavfi.astats.Overall.* keys respectively. We extract these from the last processed frames.
1779+
//
1780+
// The noise floor and silence threshold are computed from interval data AFTER the full pass,
1781+
// eliminating the need for a separate pre-scan phase.
18971782
func AnalyzeAudio(filename string, config *FilterChainConfig, progressCallback func(pass int, passName string, progress float64, level float64, measurements *AudioMeasurements)) (*AudioMeasurements, error) {
1898-
// Pre-scan: Estimate noise floor from first 15% of audio to set adaptive silencedetect threshold
1899-
// This allows detection of intentional room tone that may be quieter than the default -50 dBFS
1900-
preScanNF, err := preScanNoiseFloor(filename)
1901-
if err != nil {
1902-
// Non-fatal: fall back to default threshold if pre-scan fails
1903-
preScanNF = -50.0
1904-
}
1905-
adaptiveThreshold := calculateAdaptiveSilenceThreshold(preScanNF)
1783+
// Default fallback noise floor (dBFS), used if interval analysis yields insufficient data
1784+
const defaultNoiseFloor = -50.0
19061785

19071786
// Open audio file
19081787
reader, metadata, err := audio.OpenAudioFile(filename)
@@ -2061,11 +1940,20 @@ func AnalyzeAudio(filename string, config *FilterChainConfig, progressCallback f
20611940
ffmpeg.AVFilterGraphFree(&filterGraph)
20621941
filterFreed = true
20631942

1943+
// Estimate noise floor and silence threshold from interval data
1944+
// This replaces the previous separate pre-scan pass
1945+
noiseFloorEstimate, silenceThreshold, ok := estimateNoiseFloorAndThreshold(intervals)
1946+
if !ok {
1947+
// Fallback if insufficient interval data (very short recordings)
1948+
noiseFloorEstimate = defaultNoiseFloor
1949+
silenceThreshold = calculateAdaptiveSilenceThreshold(defaultNoiseFloor)
1950+
}
1951+
20641952
// Create measurements struct and populate from accumulators
20651953
measurements := &AudioMeasurements{
2066-
// Store pre-scan adaptive silence detection thresholds
2067-
PreScanNoiseFloor: preScanNF,
2068-
SilenceDetectLevel: adaptiveThreshold,
1954+
// Noise floor estimated from interval data (replaces pre-scan)
1955+
PreScanNoiseFloor: noiseFloorEstimate,
1956+
SilenceDetectLevel: silenceThreshold,
20691957
}
20701958

20711959
// Populate ebur128 loudness measurements
@@ -2178,11 +2066,8 @@ func AnalyzeAudio(filename string, config *FilterChainConfig, progressCallback f
21782066
// Store 250ms interval samples for data-driven silence candidate detection
21792067
measurements.IntervalSamples = intervals
21802068

2181-
// Detect silence regions using data-driven threshold from interval distribution
2182-
// Instead of arbitrary headroom, derive threshold from the actual RMS distribution:
2183-
// Silence intervals are statistical outliers at the low end of the distribution
2184-
silenceThreshold := calculateSilenceThresholdFromIntervals(intervals, adaptiveThreshold)
2185-
measurements.SilenceDetectLevel = silenceThreshold // Update with actual computed threshold
2069+
// Detect silence regions using threshold already computed from interval distribution
2070+
// The silenceThreshold was calculated above via estimateNoiseFloorAndThreshold()
21862071
measurements.SilenceRegions = findSilenceCandidatesFromIntervals(intervals, silenceThreshold, 0)
21872072

21882073
// Extract noise profile from best silence region (if available)

0 commit comments

Comments
 (0)