Skip to content

Commit 832dfbc

Browse files
committed
feat(processor): make NoiseRemove compand parameters adaptive to measured noise floor
Previously, tuneNoiseRemove() used fixed values (-55dB threshold, 6dB expansion) regardless of actual noise characteristics. This meant clean recordings received no adaptive benefit while noisy recordings could have speech unnecessarily affected.

Now the compand filter adapts to the noise measured from silence regions:
- Threshold: noise floor + 5dB (clamped to [-70, -40])
- Expansion: scales with noise severity (4-12dB across 4 tiers)

Default values updated to -55dB/6dB as fallback when no noise profile is available. Enhanced logging shows the measured noise floor and the adaptive derivation rationale.

Add TestScaleExpansion with 8 test cases covering all expansion tiers.

Signed-off-by: Martin Wimpress <martin@wimpress.org>

feat(processor): prefer speech-specific spectral metrics for adaptive filter tuning

Previously, adaptive filter tuning used full-file spectral measurements, which are diluted by silence periods. The SpeechProfile struct contains speech-specific versions of spectral metrics that provide more accurate input for tuning filters that process speech content.

Now these filters prefer speech-specific measurements when available:
- tuneDeesserFull: centroid and rolloff for sibilance detection
- tuneLA2ARatio: kurtosis for harmonic structure decisions
- tuneLA2ARelease: flux for timing decisions
- tuneDC1Declick: centroid for window sizing

Falls back to full-file measurements when SpeechProfile is nil or has zero values.

Added preferSpeechMetric helper with 4 test cases covering available, zero, negative, and both-zero scenarios.

Signed-off-by: Martin Wimpress <martin@wimpress.org>
1 parent 7b7c26f commit 832dfbc

4 files changed

Lines changed: 253 additions & 92 deletions

File tree

internal/logging/report.go

Lines changed: 72 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,7 @@ func formatFilter(f *os.File, filterID processor.FilterID, cfg *processor.Filter
494494
case processor.FilterNoiseRemove:
495495
formatNoiseRemoveFilter(f, cfg, m, prefix)
496496
case processor.FilterDC1Declick:
497-
formatDC1DeclickFilter(f, cfg, prefix)
497+
formatDC1DeclickFilter(f, cfg, m, prefix)
498498
case processor.FilterDS201Gate:
499499
formatDS201GateFilter(f, cfg, m, prefix)
500500
case processor.FilterLA2ACompressor:
@@ -655,25 +655,26 @@ func formatNoiseRemoveFilter(f *os.File, cfg *processor.FilterChainConfig, m *pr
655655
cfg.NoiseRemoveResearchSec,
656656
cfg.NoiseRemoveSmooth)
657657

658-
// compand parameters (adaptive)
659-
fmt.Fprintf(f, " compand: threshold %.0f dB, expansion %.0f dB\n",
660-
cfg.NoiseRemoveCompandThreshold,
661-
cfg.NoiseRemoveCompandExpansion)
658+
// compand parameters and rationale - show noise floor source
659+
if m != nil && m.NoiseProfile != nil && m.NoiseProfile.MeasuredNoiseFloor < 0 {
660+
fmt.Fprintf(f, " noise floor: %.1f dBFS (from silence regions)\n",
661+
m.NoiseProfile.MeasuredNoiseFloor)
662+
fmt.Fprintf(f, " compand: threshold %.0f dB (floor + 5dB), expansion %.0f dB\n",
663+
cfg.NoiseRemoveCompandThreshold,
664+
cfg.NoiseRemoveCompandExpansion)
665+
} else {
666+
fmt.Fprintf(f, " compand: threshold %.0f dB, expansion %.0f dB (defaults - no noise profile)\n",
667+
cfg.NoiseRemoveCompandThreshold,
668+
cfg.NoiseRemoveCompandExpansion)
669+
}
662670
fmt.Fprintf(f, " timing: attack %.0fms, decay %.0fms, knee %.0f dB\n",
663671
cfg.NoiseRemoveCompandAttack*1000,
664672
cfg.NoiseRemoveCompandDecay*1000,
665673
cfg.NoiseRemoveCompandKnee)
666-
667-
// Show adaptive rationale if noise profile available
668-
if m != nil && m.NoiseProfile != nil && m.NoiseProfile.Duration > 0 {
669-
fmt.Fprintf(f, " Rationale: noise floor %.1f dB → target -90 dB (%.0f dB expansion)\n",
670-
m.NoiseProfile.MeasuredNoiseFloor,
671-
cfg.NoiseRemoveCompandExpansion)
672-
}
673674
}
674675

675676
// formatDC1DeclickFilter outputs CEDAR DC-1-inspired declicker filter details
676-
func formatDC1DeclickFilter(f *os.File, cfg *processor.FilterChainConfig, prefix string) {
677+
func formatDC1DeclickFilter(f *os.File, cfg *processor.FilterChainConfig, m *processor.AudioMeasurements, prefix string) {
677678
if !cfg.DC1DeclickEnabled {
678679
if cfg.DC1DeclickReason != "" {
679680
fmt.Fprintf(f, "%sDC1 Declick: DISABLED (%s)\n", prefix, cfg.DC1DeclickReason)
@@ -694,6 +695,17 @@ func formatDC1DeclickFilter(f *os.File, cfg *processor.FilterChainConfig, prefix
694695
if cfg.DC1DeclickReason != "" {
695696
fmt.Fprintf(f, " Reason: %s\n", cfg.DC1DeclickReason)
696697
}
698+
699+
// Show centroid with measurement source (used for window sizing)
700+
if m != nil && m.SpectralCentroid > 0 {
701+
centroid := m.SpectralCentroid
702+
centroidSource := "full-file"
703+
if m.SpeechProfile != nil && m.SpeechProfile.SpectralCentroid > 0 {
704+
centroid = m.SpeechProfile.SpectralCentroid
705+
centroidSource = "speech region"
706+
}
707+
fmt.Fprintf(f, " spectral centroid: %.0f Hz (%s)\n", centroid, centroidSource)
708+
}
697709
}
698710

699711
// joinWithComma joins string slice with comma separator
@@ -804,7 +816,7 @@ func formatLA2ACompressorFilter(f *os.File, cfg *processor.FilterChainConfig, m
804816
fmt.Fprintf(f, " Timing: attack %.0fms, release %.0fms\n", cfg.LA2AAttack, cfg.LA2ARelease)
805817
fmt.Fprintf(f, " Mix: %.0f%%, knee %.1f\n", cfg.LA2AMix*100, cfg.LA2AKnee)
806818

807-
// Show rationale
819+
// Show rationale with measurement sources
808820
if m != nil && m.DynamicRange > 0 {
809821
dynamicsType := "moderate"
810822
if m.DynamicRange > 30 {
@@ -813,29 +825,69 @@ func formatLA2ACompressorFilter(f *os.File, cfg *processor.FilterChainConfig, m
813825
dynamicsType = "already compressed"
814826
}
815827
fmt.Fprintf(f, " Rationale: DR %.1f dB (%s), LRA %.1f LU\n", m.DynamicRange, dynamicsType, m.InputLRA)
828+
829+
// Show kurtosis and flux with sources (used for ratio and release tuning)
830+
kurtosis := m.SpectralKurtosis
831+
flux := m.SpectralFlux
832+
kurtosisSource := "full-file"
833+
fluxSource := "full-file"
834+
if m.SpeechProfile != nil {
835+
if m.SpeechProfile.SpectralKurtosis > 0 {
836+
kurtosis = m.SpeechProfile.SpectralKurtosis
837+
kurtosisSource = "speech region"
838+
}
839+
if m.SpeechProfile.SpectralFlux > 0 {
840+
flux = m.SpeechProfile.SpectralFlux
841+
fluxSource = "speech region"
842+
}
843+
}
844+
fmt.Fprintf(f, " spectral kurtosis: %.1f (%s)\n", kurtosis, kurtosisSource)
845+
fmt.Fprintf(f, " spectral flux: %.4f (%s)\n", flux, fluxSource)
816846
}
817847
}
818848

819849
// formatDeesserFilter outputs deesser filter details
820850
func formatDeesserFilter(f *os.File, cfg *processor.FilterChainConfig, m *processor.AudioMeasurements, prefix string) {
821-
if !cfg.DeessEnabled || cfg.DeessIntensity == 0 {
851+
if !cfg.DeessEnabled {
822852
fmt.Fprintf(f, "%sdeesser: DISABLED\n", prefix)
823853
return
824854
}
855+
if cfg.DeessIntensity == 0 {
856+
// Enabled but intensity is 0 - adaptive tuning determined no de-essing needed
857+
fmt.Fprintf(f, "%sdeesser: inactive: no sibilance detected\n", prefix)
858+
return
859+
}
825860

826861
fmt.Fprintf(f, "%sdeesser: intensity %.0f%%, amount %.0f%%, freq %.0f%%\n",
827862
prefix, cfg.DeessIntensity*100, cfg.DeessAmount*100, cfg.DeessFreq*100)
828863

829-
// Show rationale
864+
// Show rationale with measurement source
830865
if m != nil && m.SpectralCentroid > 0 {
866+
// Determine which values were used and their sources
867+
centroid := m.SpectralCentroid
868+
rolloff := m.SpectralRolloff
869+
centroidSource := "full-file"
870+
rolloffSource := "full-file"
871+
if m.SpeechProfile != nil {
872+
if m.SpeechProfile.SpectralCentroid > 0 {
873+
centroid = m.SpeechProfile.SpectralCentroid
874+
centroidSource = "speech region"
875+
}
876+
if m.SpeechProfile.SpectralRolloff > 0 {
877+
rolloff = m.SpeechProfile.SpectralRolloff
878+
rolloffSource = "speech region"
879+
}
880+
}
881+
831882
voiceType := "normal"
832-
if m.SpectralCentroid > 7000 {
883+
if centroid > 7000 {
833884
voiceType = "very bright"
834-
} else if m.SpectralCentroid > 6000 {
885+
} else if centroid > 6000 {
835886
voiceType = "bright"
836887
}
837-
fmt.Fprintf(f, " Rationale: %s voice (centroid %.0f Hz, rolloff %.0f Hz)\n",
838-
voiceType, m.SpectralCentroid, m.SpectralRolloff)
888+
fmt.Fprintf(f, " Rationale: %s voice\n", voiceType)
889+
fmt.Fprintf(f, " spectral centroid: %.0f Hz (%s)\n", centroid, centroidSource)
890+
fmt.Fprintf(f, " spectral rolloff: %.0f Hz (%s)\n", rolloff, rolloffSource)
839891
}
840892
}
841893

internal/processor/adaptive.go

Lines changed: 87 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -716,57 +716,89 @@ func tuneDC1Declick(config *FilterChainConfig, measurements *AudioMeasurements)
716716
config.DC1DeclickReason += "; +threshold (compressed)"
717717
}
718718

719+
// Prefer speech-specific centroid for window sizing
720+
centroid := measurements.SpectralCentroid
721+
if measurements.SpeechProfile != nil {
722+
centroid = preferSpeechMetric(centroid, measurements.SpeechProfile.SpectralCentroid)
723+
}
724+
719725
// Window adaptation based on content type
720726
switch {
721-
case measurements.SpectralCentroid > dc1CentroidFast:
727+
case centroid > dc1CentroidFast:
722728
// Fast speech/plosives - shorter window preserves transients
723729
config.DC1DeclickWindow = dc1WindowShort
724-
case measurements.SpectralCentroid < dc1CentroidSlow:
730+
case centroid < dc1CentroidSlow:
725731
// Bass-heavy content - longer window for better LF reconstruction
726732
config.DC1DeclickWindow = dc1WindowLong
727733
default:
728734
config.DC1DeclickWindow = dc1WindowDefault
729735
}
730736
}
731737

732-
// tuneNoiseRemove configures NoiseRemove compand parameters for residual suppression.
733-
// The anlmdn parameters (strength, patch, research, smooth) are kept constant from spike validation.
734-
//
735-
// POST-ANLMDN COMPAND STRATEGY (2024-12-24):
736-
// Since anlmdn now handles the heavy noise reduction (achieving "digital black" for clean
737-
// sources and 37+ dB reduction for noisy sources), compand's role is now:
738-
// - Residual noise suppression in silence regions
739-
// - Breath noise attenuation between speech
740-
// - NOT primary noise reduction
738+
// tuneNoiseRemove adjusts compand parameters based on measured noise floor.
739+
// Uses silence region measurements for accurate noise characterisation.
741740
//
742-
// Spike testing (mcompand-spike.sh at 1500s) validated these settings:
743-
// - Fixed 6 dB expansion: provides ~5-6 dB breath/residual attenuation
744-
// - Fixed -55 dB threshold: catches breaths without affecting speech
745-
// - Single-band compand: more transparent than mcompand (0% spectral change)
741+
// The anlmdn parameters (strength, patch, research, smooth) are kept constant from spike validation.
742+
// Compand parameters adapt to the measured noise floor:
743+
// - Threshold: 5dB above noise floor (catches breaths but not speech)
744+
// - Expansion: scales with noise severity (gentle for clean, aggressive for noisy)
746745
//
747746
// anlmdn remains constant because spike testing validated these parameters:
748747
// - strength: 0.00001 (minimum)
749748
// - patch: 6ms (context window)
750749
// - research: 5.8ms (search window)
751750
// - smooth: 11 (weight smoothing)
752-
func tuneNoiseRemove(config *FilterChainConfig, measurements *AudioMeasurements) {
751+
func tuneNoiseRemove(config *FilterChainConfig, m *AudioMeasurements) {
753752
if !config.NoiseRemoveEnabled {
754753
return
755754
}
756755

757-
// Fixed compand parameters validated in spike testing (mcompand-spike.sh)
758-
// These are intentionally NOT adaptive — anlmdn handles the adaptive part
759-
const (
760-
compandThreshold = -55.0 // dB - catches breaths without affecting speech
761-
compandExpansion = 6.0 // dB - gentle push for residual/breath attenuation
762-
)
756+
// Default values (fallback if no noise profile)
757+
threshold := -55.0
758+
expansion := 6.0
759+
760+
if m.NoiseProfile != nil && m.NoiseProfile.MeasuredNoiseFloor < 0 {
761+
noiseFloor := m.NoiseProfile.MeasuredNoiseFloor
763762

764-
config.NoiseRemoveCompandThreshold = compandThreshold
765-
config.NoiseRemoveCompandExpansion = compandExpansion
763+
// Threshold: 5dB above noise floor (catches breaths but not speech)
764+
threshold = noiseFloor + 5.0
765+
// Clamp to reasonable range
766+
threshold = clamp(threshold, -70.0, -40.0)
767+
768+
// Expansion: scale with noise severity
769+
expansion = scaleExpansion(noiseFloor)
770+
}
771+
772+
config.NoiseRemoveCompandThreshold = threshold
773+
config.NoiseRemoveCompandExpansion = expansion
766774

767775
// attack, decay, knee stay constant (validated in spike testing)
768776
}
769777

778+
// preferSpeechMetric returns speech-specific measurement if available,
779+
// otherwise falls back to full-file measurement.
780+
func preferSpeechMetric(fullFile, speechProfile float64) float64 {
781+
if speechProfile > 0 {
782+
return speechProfile
783+
}
784+
return fullFile
785+
}
786+
787+
// scaleExpansion returns expansion depth based on noise severity.
788+
// Noisier recordings need more aggressive expansion to suppress residuals.
789+
func scaleExpansion(noiseFloor float64) float64 {
790+
switch {
791+
case noiseFloor > -45.0:
792+
return 12.0 // Very noisy - aggressive
793+
case noiseFloor > -55.0:
794+
return 8.0 // Moderate noise
795+
case noiseFloor > -65.0:
796+
return 6.0 // Typical
797+
default:
798+
return 4.0 // Very clean - gentle
799+
}
800+
}
801+
770802
// tuneDeesser adapts de-esser intensity based on spectral analysis.
771803
// Uses both spectral centroid (energy concentration) and rolloff (HF extension)
772804
// to detect likelihood of harsh sibilance.
@@ -793,31 +825,39 @@ func tuneDeesser(config *FilterChainConfig, measurements *AudioMeasurements) {
793825

794826
// tuneDeesserFull uses both centroid and rolloff for precise de-esser tuning
795827
func tuneDeesserFull(config *FilterChainConfig, measurements *AudioMeasurements) {
828+
// Prefer speech-specific measurements for sibilance detection
829+
centroid := measurements.SpectralCentroid
830+
rolloff := measurements.SpectralRolloff
831+
if measurements.SpeechProfile != nil {
832+
centroid = preferSpeechMetric(centroid, measurements.SpeechProfile.SpectralCentroid)
833+
rolloff = preferSpeechMetric(rolloff, measurements.SpeechProfile.SpectralRolloff)
834+
}
835+
796836
// Determine baseline intensity from centroid
797837
var baseIntensity float64
798838
switch {
799-
case measurements.SpectralCentroid > centroidVeryBright:
839+
case centroid > centroidVeryBright:
800840
baseIntensity = deessIntensityBright // Bright voice
801-
case measurements.SpectralCentroid > centroidBright:
841+
case centroid > centroidBright:
802842
baseIntensity = deessIntensityNormal // Normal voice
803843
default:
804844
baseIntensity = deessIntensityDark // Dark voice
805845
}
806846

807847
// Refine based on spectral rolloff (HF extension)
808848
switch {
809-
case measurements.SpectralRolloff < rolloffNoSibilance:
849+
case rolloff < rolloffNoSibilance:
810850
// Very limited HF content - no sibilance expected
811851
config.DeessIntensity = 0.0
812852

813-
case measurements.SpectralRolloff < rolloffLimited:
853+
case rolloff < rolloffLimited:
814854
// Limited HF extension - reduce intensity
815855
config.DeessIntensity = baseIntensity * 0.7
816856
if config.DeessIntensity < deessIntensityMin {
817857
config.DeessIntensity = 0.0 // Skip if too low
818858
}
819859

820-
case measurements.SpectralRolloff > rolloffExtensive:
860+
case rolloff > rolloffExtensive:
821861
// Extensive HF content - likely sibilance
822862
config.DeessIntensity = math.Min(baseIntensity*1.2, deessIntensityMax)
823863

@@ -1216,6 +1256,12 @@ func tuneLA2AAttack(config *FilterChainConfig, measurements *AudioMeasurements)
12161256
// - Narrow LRA + low flux = compressed/monotone, faster release OK
12171257
// - Warm voices (high skewness) get extra release to preserve body
12181258
func tuneLA2ARelease(config *FilterChainConfig, measurements *AudioMeasurements) {
1259+
// Prefer speech-specific flux for timing decisions
1260+
flux := measurements.SpectralFlux
1261+
if measurements.SpeechProfile != nil {
1262+
flux = preferSpeechMetric(flux, measurements.SpeechProfile.SpectralFlux)
1263+
}
1264+
12191265
// Start with standard LA-2A-style release
12201266
release := la2aReleaseStandard
12211267

@@ -1230,12 +1276,12 @@ func tuneLA2ARelease(config *FilterChainConfig, measurements *AudioMeasurements)
12301276
}
12311277

12321278
// Adjust based on spectral flux (frame-to-frame variation)
1233-
if measurements.SpectralFlux > 0 {
1279+
if flux > 0 {
12341280
switch {
1235-
case measurements.SpectralFlux > la2aFluxDynamic:
1281+
case flux > la2aFluxDynamic:
12361282
// Dynamic/expressive content - add release time
12371283
release = math.Max(release, la2aReleaseExpressive)
1238-
case measurements.SpectralFlux < la2aFluxStatic:
1284+
case flux < la2aFluxStatic:
12391285
// Static/monotone content - can use shorter release
12401286
release = math.Min(release, la2aReleaseCompact)
12411287
}
@@ -1265,16 +1311,22 @@ func tuneLA2ARelease(config *FilterChainConfig, measurements *AudioMeasurements)
12651311
// - Peaked/tonal content (high kurtosis) = gentler ratio, preserve character
12661312
// - Flat/noise-like content (low kurtosis) = firmer ratio, more levelling
12671313
func tuneLA2ARatio(config *FilterChainConfig, measurements *AudioMeasurements) {
1314+
// Prefer speech-specific kurtosis for harmonic structure
1315+
kurtosis := measurements.SpectralKurtosis
1316+
if measurements.SpeechProfile != nil {
1317+
kurtosis = preferSpeechMetric(kurtosis, measurements.SpeechProfile.SpectralKurtosis)
1318+
}
1319+
12681320
// Start with LA-2A baseline ratio
12691321
ratio := la2aRatioBase
12701322

12711323
// Adjust based on spectral kurtosis (peakedness)
1272-
if measurements.SpectralKurtosis > 0 {
1324+
if kurtosis > 0 {
12731325
switch {
1274-
case measurements.SpectralKurtosis > la2aKurtosisHighPeak:
1326+
case kurtosis > la2aKurtosisHighPeak:
12751327
// Highly peaked harmonics - gentler ratio preserves character
12761328
ratio = la2aRatioPeaked
1277-
case measurements.SpectralKurtosis < la2aKurtosisLowPeak:
1329+
case kurtosis < la2aKurtosisLowPeak:
12781330
// Flat spectrum - firmer ratio for consistent levelling
12791331
ratio = la2aRatioFlat
12801332
}

0 commit comments

Comments
 (0)