@@ -670,18 +670,19 @@ func roomToneScore(interval IntervalSample, rmsP50, fluxP50 float64) float64 {
670670 return roomToneAmplitudeWeight * amplitudeScore + roomToneFluxWeight * fluxScore
671671}
672672
673- // calculateSilenceThresholdFromIntervals derives the silence threshold from interval data .
674- // Uses spectral analysis to identify room tone by its characteristic stability and quietness .
673+ // estimateNoiseFloorAndThreshold analyses interval data to estimate noise floor and silence threshold .
674+ // Returns (noiseFloor, silenceThreshold, ok). If ok is false, fallback values should be used .
675675//
676- // Key insight: room tone detection should use behavioral characteristics, not just amplitude :
676+ // Uses spectral analysis to identify room tone by its characteristic stability and quietness :
677677// 1. Room tone is quieter than speech (but may overlap with quiet speech)
678678// 2. Room tone has low spectral flux (stable, unchanging)
679679// 3. Room tone has consistent spectral characteristics
680680//
681- // We compute a "room tone score" for each interval and use that to find the threshold.
682- func calculateSilenceThresholdFromIntervals (intervals []IntervalSample , fallbackThreshold float64 ) float64 {
681+ // The noise floor is the max RMS of high-confidence room tone intervals.
682+ // The silence threshold adds headroom to the noise floor for detection margin.
683+ func estimateNoiseFloorAndThreshold (intervals []IntervalSample ) (noiseFloor , silenceThreshold float64 , ok bool ) {
683684 if len (intervals ) < silenceThresholdMinIntervals {
684- return fallbackThreshold
685+ return 0 , 0 , false
685686 }
686687
687688 // Only use the first silenceSearchPercent% of intervals for threshold calculation
@@ -734,16 +735,26 @@ func calculateSilenceThresholdFromIntervals(intervals []IntervalSample, fallback
734735 candidateCount = len (scored )
735736 }
736737
737- // Threshold is the maximum RMS among high-confidence room tone intervals
738- // Add small headroom to catch edge cases
738+ // Noise floor is the maximum RMS among high-confidence room tone intervals
739739 maxRoomToneRMS := - 120.0
740740 for i := 0 ; i < candidateCount ; i ++ {
741741 if scored [i ].rms > maxRoomToneRMS {
742742 maxRoomToneRMS = scored [i ].rms
743743 }
744744 }
745745
746- return maxRoomToneRMS + silenceThresholdHeadroomDB
746+ return maxRoomToneRMS , maxRoomToneRMS + silenceThresholdHeadroomDB , true
747+ }
748+
749+ // calculateSilenceThresholdFromIntervals derives the silence threshold from interval data.
750+ // This is a convenience wrapper around estimateNoiseFloorAndThreshold for callers that only
751+ // need the threshold value.
752+ func calculateSilenceThresholdFromIntervals (intervals []IntervalSample , fallbackThreshold float64 ) float64 {
753+ _ , threshold , ok := estimateNoiseFloorAndThreshold (intervals )
754+ if ! ok {
755+ return fallbackThreshold
756+ }
757+ return threshold
747758}
748759
749760// findSilenceCandidatesFromIntervals identifies silence regions from interval samples.
@@ -1443,8 +1454,8 @@ type AudioMeasurements struct {
14431454 TargetOffset float64 `json:"target_offset"` // Offset for normalization
14441455 NoiseFloor float64 `json:"noise_floor"` // Measured noise floor from astats (dBFS)
14451456
1446- // Pre-scan adaptive silence detection thresholds
1447- PreScanNoiseFloor float64 `json:"prescan_noise_floor"` // Noise floor estimated from first 15% of audio (dBFS)
1457+ // Adaptive silence detection thresholds (derived from interval sampling)
1458+ PreScanNoiseFloor float64 `json:"prescan_noise_floor"` // Noise floor estimated from first 15% of intervals (dBFS)
14481459 SilenceDetectLevel float64 `json:"silence_detect_level"` // Adaptive silencedetect threshold used (dBFS)
14491460
14501461 // Silence detection results (derived from interval sampling)
@@ -1725,165 +1736,36 @@ func finalizeOutputMeasurements(acc *outputMetadataAccumulators) *OutputMeasurem
17251736 return m
17261737}
17271738
1728- // Pre-scan constants
1739+ // Threshold bounds for adaptive silence detection
17291740const (
1730- // preScanPercent is the fraction of audio to scan for adaptive threshold detection.
1731- // We scan the first 15% of the recording to estimate the noise floor before running
1732- // the full Pass 1 analysis. This allows silencedetect to use an adaptive threshold.
1733- preScanPercent = 0.15
1734-
1735- // preScanSilenceHeadroom is added to the pre-scan noise floor to get the silencedetect threshold.
1741+ // silenceFallbackHeadroom is added to the noise floor to get the silencedetect threshold.
17361742 // A region is considered "silence" if it's within this headroom of the noise floor.
17371743 // Higher values detect more silence (including quieter room tone) but may include crosstalk.
1738- preScanSilenceHeadroom = 6.0 // dB
1744+ silenceFallbackHeadroom = 6.0 // dB
17391745
1740- // preScanMinThreshold prevents silencedetect from being too sensitive in very quiet recordings.
1746+ // silenceMinThreshold prevents silencedetect from being too sensitive in very quiet recordings.
17411747 // Even professional recordings rarely have silence below -70 dBFS.
1742- preScanMinThreshold = - 70.0
1748+ silenceMinThreshold = - 70.0
17431749
1744- // preScanMaxThreshold prevents silencedetect from detecting loud sections as silence.
1750+ // silenceMaxThreshold prevents silencedetect from detecting loud sections as silence.
17451751 // If the estimated threshold is above this, something is wrong with the recording.
1746- preScanMaxThreshold = - 35.0
1752+ silenceMaxThreshold = - 35.0
17471753)
17481754
1749- // preScanNoiseFloor performs a quick scan of the first 15% of the audio to estimate
1750- // the noise floor. This is used to set an adaptive silencedetect threshold for Pass 1.
1751- //
1752- // The function runs astats on the initial portion of audio and returns the RMSTrough
1753- // measurement, which represents the level of the quietest segments (likely ambient noise).
1754- //
1755- // Returns the estimated noise floor in dBFS, or an error if the scan fails.
1756- // If RMSTrough is not available, falls back to RMSLevel - 15dB.
1757- func preScanNoiseFloor (filename string ) (float64 , error ) {
1758- // Open audio file
1759- reader , metadata , err := audio .OpenAudioFile (filename )
1760- if err != nil {
1761- return 0 , fmt .Errorf ("pre-scan: failed to open audio file: %w" , err )
1762- }
1763- defer reader .Close ()
1764-
1765- // Calculate how many frames to process (15% of audio)
1766- totalDuration := metadata .Duration
1767- scanDuration := totalDuration * preScanPercent
1768- sampleRate := float64 (metadata .SampleRate )
1769- samplesPerFrame := 4096.0
1770- maxFrames := int ((scanDuration * sampleRate ) / samplesPerFrame )
1771-
1772- // Create a simple filter graph with just downmix and astats
1773- // We need mono for consistent measurements, and astats for noise floor detection
1774- filterSpec := "aformat=channel_layouts=mono,astats=metadata=1:measure_overall=Noise_floor+RMS_level+RMS_trough+Peak_level:measure_perchannel=0"
1775-
1776- filterGraph , bufferSrcCtx , bufferSinkCtx , err := setupFilterGraph (
1777- reader .GetDecoderContext (),
1778- filterSpec ,
1779- )
1780- if err != nil {
1781- return 0 , fmt .Errorf ("pre-scan: failed to create filter graph: %w" , err )
1782- }
1783- defer ffmpeg .AVFilterGraphFree (& filterGraph )
1784-
1785- filteredFrame := ffmpeg .AVFrameAlloc ()
1786- defer ffmpeg .AVFrameFree (& filteredFrame )
1787-
1788- // Track measurements
1789- var rmsTrough , rmsLevel float64
1790- frameCount := 0
1791-
1792- // Process frames until we've scanned 15% of the audio
1793- for frameCount < maxFrames {
1794- frame , err := reader .ReadFrame ()
1795- if err != nil {
1796- return 0 , fmt .Errorf ("pre-scan: failed to read frame: %w" , err )
1797- }
1798- if frame == nil {
1799- break // EOF
1800- }
1801-
1802- // Push frame into filter graph
1803- if _ , err := ffmpeg .AVBuffersrcAddFrameFlags (bufferSrcCtx , frame , 0 ); err != nil {
1804- return 0 , fmt .Errorf ("pre-scan: failed to add frame to filter: %w" , err )
1805- }
1806- frameCount ++
1807-
1808- // Pull filtered frames and extract astats metadata
1809- for {
1810- if _ , err := ffmpeg .AVBuffersinkGetFrame (bufferSinkCtx , filteredFrame ); err != nil {
1811- if errors .Is (err , ffmpeg .EAgain ) || errors .Is (err , ffmpeg .AVErrorEOF ) {
1812- break
1813- }
1814- return 0 , fmt .Errorf ("pre-scan: failed to get filtered frame: %w" , err )
1815- }
1816-
1817- // Extract astats measurements from metadata
1818- if metadata := filteredFrame .Metadata (); metadata != nil {
1819- if value , ok := getFloatMetadata (metadata , metaKeyRMSTrough ); ok {
1820- rmsTrough = value
1821- }
1822- if value , ok := getFloatMetadata (metadata , metaKeyRMSLevel ); ok {
1823- rmsLevel = value
1824- }
1825- }
1826-
1827- ffmpeg .AVFrameUnref (filteredFrame )
1828- }
1829- }
1830-
1831- // Flush the filter graph
1832- if _ , err := ffmpeg .AVBuffersrcAddFrameFlags (bufferSrcCtx , nil , 0 ); err != nil {
1833- return 0 , fmt .Errorf ("pre-scan: failed to flush filter: %w" , err )
1834- }
1835-
1836- // Pull remaining frames
1837- for {
1838- if _ , err := ffmpeg .AVBuffersinkGetFrame (bufferSinkCtx , filteredFrame ); err != nil {
1839- if errors .Is (err , ffmpeg .EAgain ) || errors .Is (err , ffmpeg .AVErrorEOF ) {
1840- break
1841- }
1842- return 0 , fmt .Errorf ("pre-scan: failed to get remaining frame: %w" , err )
1843- }
1844-
1845- if metadata := filteredFrame .Metadata (); metadata != nil {
1846- if value , ok := getFloatMetadata (metadata , metaKeyRMSTrough ); ok {
1847- rmsTrough = value
1848- }
1849- if value , ok := getFloatMetadata (metadata , metaKeyRMSLevel ); ok {
1850- rmsLevel = value
1851- }
1852- }
1853-
1854- ffmpeg .AVFrameUnref (filteredFrame )
1855- }
1856-
1857- // Determine noise floor from measurements
1858- var noiseFloor float64
1859- if rmsTrough != 0 && ! math .IsInf (rmsTrough , - 1 ) {
1860- // Primary: use RMSTrough (quietest segments)
1861- noiseFloor = rmsTrough
1862- } else if rmsLevel != 0 && ! math .IsInf (rmsLevel , - 1 ) {
1863- // Fallback: estimate from overall RMS level
1864- // Quiet segments are typically 12-18dB below average RMS
1865- noiseFloor = rmsLevel - 15.0
1866- } else {
1867- // No measurements available, use conservative default
1868- return - 50.0 , nil // Default silencedetect threshold
1869- }
1870-
1871- return noiseFloor , nil
1872- }
1873-
1874- // calculateAdaptiveSilenceThreshold computes the silencedetect threshold from the pre-scan noise floor.
1755+ // calculateAdaptiveSilenceThreshold computes a bounded silence threshold from a noise floor estimate.
18751756// Returns a threshold that's slightly above the noise floor to detect quiet room tone as silence.
1757+ // This is used as a fallback when interval-based estimation has insufficient data.
18761758func calculateAdaptiveSilenceThreshold (noiseFloor float64 ) float64 {
18771759 // Silence threshold = noise floor + headroom
18781760 // This allows silencedetect to find regions that are at or slightly above the ambient noise
1879- threshold := noiseFloor + preScanSilenceHeadroom
1761+ threshold := noiseFloor + silenceFallbackHeadroom
18801762
18811763 // Apply bounds to prevent extreme values
1882- if threshold < preScanMinThreshold {
1883- threshold = preScanMinThreshold
1764+ if threshold < silenceMinThreshold {
1765+ threshold = silenceMinThreshold
18841766 }
1885- if threshold > preScanMaxThreshold {
1886- threshold = preScanMaxThreshold
1767+ if threshold > silenceMaxThreshold {
1768+ threshold = silenceMaxThreshold
18871769 }
18881770
18891771 return threshold
@@ -1894,15 +1776,12 @@ func calculateAdaptiveSilenceThreshold(noiseFloor float64) float64 {
18941776//
18951777// Implementation note: ebur128 and astats write measurements to frame metadata with lavfi.r128.*
18961778// and lavfi.astats.Overall.* keys respectively. We extract these from the last processed frames.
1779+ //
1780+ // The noise floor and silence threshold are computed from interval data AFTER the full pass,
1781+ // eliminating the need for a separate pre-scan phase.
18971782func AnalyzeAudio (filename string , config * FilterChainConfig , progressCallback func (pass int , passName string , progress float64 , level float64 , measurements * AudioMeasurements )) (* AudioMeasurements , error ) {
1898- // Pre-scan: Estimate noise floor from first 15% of audio to set adaptive silencedetect threshold
1899- // This allows detection of intentional room tone that may be quieter than the default -50 dBFS
1900- preScanNF , err := preScanNoiseFloor (filename )
1901- if err != nil {
1902- // Non-fatal: fall back to default threshold if pre-scan fails
1903- preScanNF = - 50.0
1904- }
1905- adaptiveThreshold := calculateAdaptiveSilenceThreshold (preScanNF )
1783+ // Default fallback threshold if interval analysis yields insufficient data
1784+ const defaultNoiseFloor = - 50.0
19061785
19071786 // Open audio file
19081787 reader , metadata , err := audio .OpenAudioFile (filename )
@@ -2061,11 +1940,20 @@ func AnalyzeAudio(filename string, config *FilterChainConfig, progressCallback f
20611940 ffmpeg .AVFilterGraphFree (& filterGraph )
20621941 filterFreed = true
20631942
1943+ // Estimate noise floor and silence threshold from interval data
1944+ // This replaces the previous separate pre-scan pass
1945+ noiseFloorEstimate , silenceThreshold , ok := estimateNoiseFloorAndThreshold (intervals )
1946+ if ! ok {
1947+ // Fallback if insufficient interval data (very short recordings)
1948+ noiseFloorEstimate = defaultNoiseFloor
1949+ silenceThreshold = calculateAdaptiveSilenceThreshold (defaultNoiseFloor )
1950+ }
1951+
20641952 // Create measurements struct and populate from accumulators
20651953 measurements := & AudioMeasurements {
2066- // Store pre-scan adaptive silence detection thresholds
2067- PreScanNoiseFloor : preScanNF ,
2068- SilenceDetectLevel : adaptiveThreshold ,
1954+ // Noise floor estimated from interval data (replaces pre-scan)
1955+ PreScanNoiseFloor : noiseFloorEstimate ,
1956+ SilenceDetectLevel : silenceThreshold ,
20691957 }
20701958
20711959 // Populate ebur128 loudness measurements
@@ -2178,11 +2066,8 @@ func AnalyzeAudio(filename string, config *FilterChainConfig, progressCallback f
21782066 // Store 250ms interval samples for data-driven silence candidate detection
21792067 measurements .IntervalSamples = intervals
21802068
2181- // Detect silence regions using data-driven threshold from interval distribution
2182- // Instead of arbitrary headroom, derive threshold from the actual RMS distribution:
2183- // Silence intervals are statistical outliers at the low end of the distribution
2184- silenceThreshold := calculateSilenceThresholdFromIntervals (intervals , adaptiveThreshold )
2185- measurements .SilenceDetectLevel = silenceThreshold // Update with actual computed threshold
2069+ // Detect silence regions using threshold already computed from interval distribution
2070+ // The silenceThreshold was calculated above via estimateNoiseFloorAndThreshold()
21862071 measurements .SilenceRegions = findSilenceCandidatesFromIntervals (intervals , silenceThreshold , 0 )
21872072
21882073 // Extract noise profile from best silence region (if available)
0 commit comments