@@ -135,6 +135,11 @@ type SpeechCandidateMetrics struct {
135135
136136 // Scoring
137137 Score float64 `json:"score"` // Composite score for candidate ranking
138+
139+ // Golden sub-region refinement info (populated when a long candidate is refined)
140+ OriginalStart time.Duration `json:"original_start,omitempty"` // Original candidate start before refinement
141+ OriginalDuration time.Duration `json:"original_duration,omitempty"` // Original candidate duration before refinement
142+ WasRefined bool `json:"was_refined,omitempty"` // True if region was refined from a longer candidate
138143}
139144
140145// IntervalSample contains all measurements for a 250ms audio window.
@@ -530,6 +535,12 @@ const (
530535 // speechEntropyMax is the maximum entropy for speech (structured signal).
531536 // Pure noise approaches 1.0; speech is typically 0.3-0.7.
532537 speechEntropyMax = 0.85
538+
539+ // Golden speech region refinement constants
540+ // After selecting the best speech candidate, refine to a representative sub-window
541+ // to avoid averaging across pauses that contaminate spectral metrics.
542+ goldenSpeechWindowDuration = 60 * time .Second // Target: 60s of representative speech
543+ goldenSpeechWindowMinimum = 30 * time .Second // Minimum acceptable window
533544)
534545
535546// refineToGoldenSubregion finds the cleanest sub-region within a silence candidate.
@@ -788,6 +799,143 @@ func scoreIntervalWindow(intervals []IntervalSample) float64 {
788799 return sumRMS / float64 (len (intervals ))
789800}
790801
802+ // scoreSpeechIntervalWindow calculates a quality score for a contiguous window of speech intervals.
803+ // Returns a 0-1 score where higher = better quality speech for profiling.
804+ // Scores based on spectral characteristics that indicate clear, continuous speech:
805+ // - Kurtosis (0.20): higher average = clearer harmonics
806+ // - Flatness (0.20): lower = more tonal (inverted)
807+ // - Centroid (0.20): peak at voice centre (~2000 Hz), decay toward edges
808+ // - Consistency (0.25): low kurtosis variance = stable voicing
809+ // - RMS (0.15): louder = more active speech
810+ func scoreSpeechIntervalWindow (intervals []IntervalSample ) float64 {
811+ if len (intervals ) == 0 {
812+ return 0 // Should not happen in normal use
813+ }
814+
815+ n := float64 (len (intervals ))
816+
817+ // Accumulate metrics
818+ var kurtosisSum , flatnessSum , centroidSum , rmsSum float64
819+ kurtosisValues := make ([]float64 , len (intervals ))
820+
821+ for i , interval := range intervals {
822+ kurtosisSum += interval .SpectralKurtosis
823+ flatnessSum += interval .SpectralFlatness
824+ centroidSum += interval .SpectralCentroid
825+ rmsSum += interval .RMSLevel
826+ kurtosisValues [i ] = interval .SpectralKurtosis
827+ }
828+
829+ avgKurtosis := kurtosisSum / n
830+ avgFlatness := flatnessSum / n
831+ avgCentroid := centroidSum / n
832+ avgRMS := rmsSum / n
833+
834+ // Calculate kurtosis variance for consistency score
835+ var kurtosisVarianceSum float64
836+ for _ , k := range kurtosisValues {
837+ diff := k - avgKurtosis
838+ kurtosisVarianceSum += diff * diff
839+ }
840+ kurtosisVariance := kurtosisVarianceSum / n
841+
842+ // Kurtosis score (0.20): higher kurtosis = clearer harmonics
843+ // Typical speech kurtosis ranges 2-10; score peaks around 6
844+ kurtosisScore := clampFloat (avgKurtosis / 6.0 , 0.0 , 1.0 )
845+
846+ // Flatness score (0.20): lower flatness = more tonal = better speech
847+ // Flatness 0 = pure tone, 1 = white noise; speech typically 0.1-0.4
848+ flatnessScore := clampFloat (1.0 - avgFlatness , 0.0 , 1.0 )
849+
850+ // Centroid score (0.20): peak at voice centre, decay toward edges
851+ // Voice range: speechCentroidMin (200 Hz) to speechCentroidMax (4500 Hz)
852+ centroidScore := 0.0
853+ if avgCentroid >= speechCentroidMin && avgCentroid <= speechCentroidMax {
854+ // Calculate distance from ideal centre (~2000 Hz)
855+ voiceMid := (speechCentroidMin + speechCentroidMax ) / 2
856+ voiceHalfWidth := (speechCentroidMax - speechCentroidMin ) / 2
857+ distFromMid := math .Abs (avgCentroid - voiceMid )
858+ // Score decays to 0.5 at edges, 1.0 at centre
859+ centroidScore = 1.0 - (distFromMid / voiceHalfWidth )* 0.5
860+ }
861+
862+ // Consistency score (0.25): low kurtosis variance = stable voicing
863+ // Variance > 100 is very inconsistent; clamp score at that point
864+ consistencyScore := clampFloat (1.0 - (kurtosisVariance / 100.0 ), 0.0 , 1.0 )
865+
866+ // RMS score (0.15): louder = more active speech
867+ // Range: -30 dBFS (worst) to -12 dBFS (best)
868+ rmsScore := 0.0
869+ if avgRMS > - 30.0 {
870+ rmsScore = clampFloat ((avgRMS - (- 30.0 ))/ 18.0 , 0.0 , 1.0 )
871+ }
872+
873+ // Weighted combination (per plan: consistency elevated to 0.25)
874+ return kurtosisScore * 0.20 + flatnessScore * 0.20 + centroidScore * 0.20 + consistencyScore * 0.25 + rmsScore * 0.15
875+ }
876+
877+ // refineToGoldenSpeechSubregion finds the most representative sub-region within a speech candidate.
878+ // Uses existing interval samples to find the window with highest speech quality score.
879+ // Returns the original region if it's already at or below goldenSpeechWindowDuration,
880+ // or if refinement fails for any reason (insufficient intervals, etc.).
881+ //
882+ // This addresses cases where a long speech region contains pauses that contaminate
883+ // spectral metrics when averaged. By refining to the best 60s window, we isolate
884+ // continuous speech for more accurate adaptive filter tuning.
885+ func refineToGoldenSpeechSubregion (candidate * SpeechRegion , intervals []IntervalSample ) * SpeechRegion {
886+ if candidate == nil {
887+ return nil
888+ }
889+
890+ // No refinement needed if already at or below target duration
891+ if candidate .Duration <= goldenSpeechWindowDuration {
892+ return candidate
893+ }
894+
895+ // Extract intervals within the candidate's time range
896+ candidateIntervals := getIntervalsInRange (intervals , candidate .Start , candidate .End )
897+ if candidateIntervals == nil {
898+ return candidate
899+ }
900+
901+ // Calculate window size in intervals (60s / 250ms = 240 intervals)
902+ windowIntervals := int (goldenSpeechWindowDuration / goldenIntervalSize )
903+ minimumIntervals := int (goldenSpeechWindowMinimum / goldenIntervalSize )
904+
905+ // Need at least minimum window worth of intervals
906+ if len (candidateIntervals ) < minimumIntervals {
907+ return candidate
908+ }
909+
910+ // If we have fewer intervals than target window, use what we have
911+ if len (candidateIntervals ) < windowIntervals {
912+ windowIntervals = len (candidateIntervals )
913+ }
914+
915+ // Slide window across intervals, finding position with highest speech quality score
916+ bestStartIdx := 0
917+ bestScore := scoreSpeechIntervalWindow (candidateIntervals [:windowIntervals ])
918+
919+ for startIdx := 1 ; startIdx <= len (candidateIntervals )- windowIntervals ; startIdx ++ {
920+ windowScore := scoreSpeechIntervalWindow (candidateIntervals [startIdx : startIdx + windowIntervals ])
921+ if windowScore > bestScore {
922+ bestScore = windowScore
923+ bestStartIdx = startIdx
924+ }
925+ }
926+
927+ // Calculate refined region bounds from the best window position
928+ refinedStart := candidateIntervals [bestStartIdx ].Timestamp
929+ refinedDuration := time .Duration (windowIntervals ) * goldenIntervalSize
930+ refinedEnd := refinedStart + refinedDuration
931+
932+ return & SpeechRegion {
933+ Start : refinedStart ,
934+ End : refinedEnd ,
935+ Duration : refinedDuration ,
936+ }
937+ }
938+
791939// roomToneScore calculates a 0-1 score indicating how likely an interval is room tone.
792940// Room tone has characteristic spectral behaviour:
793941// - Low SpectralFlux (stable, not changing)
@@ -2992,6 +3140,8 @@ type findBestSpeechRegionResult struct {
29923140// findBestSpeechRegion selects the best speech region for measurements.
29933141// Strategy: prefer longest duration that meets quality threshold.
29943142// Unlike silence (where earlier is better), speech benefits from longer samples.
3143+ // For long candidates (>60s), refines to the best 60s sub-region to avoid
3144+ // contaminating spectral metrics with pauses.
29953145func findBestSpeechRegion (regions []SpeechRegion , intervals []IntervalSample ) * findBestSpeechRegionResult {
29963146 result := & findBestSpeechRegionResult {}
29973147
@@ -3026,6 +3176,41 @@ func findBestSpeechRegion(regions []SpeechRegion, intervals []IntervalSample) *f
30263176 }
30273177 }
30283178
3179+ // Refine long candidates to golden sub-region
3180+ if bestCandidate != nil && bestCandidate .Duration > goldenSpeechWindowDuration {
3181+ originalRegion := * bestCandidate
3182+ refined := refineToGoldenSpeechSubregion (bestCandidate , intervals )
3183+
3184+ if refined != nil {
3185+ wasRefined := refined .Start != originalRegion .Start ||
3186+ refined .Duration != originalRegion .Duration
3187+
3188+ if wasRefined {
3189+ // Re-measure the refined region
3190+ refinedMetrics := measureSpeechCandidateFromIntervals (* refined , intervals )
3191+ if refinedMetrics != nil {
3192+ refinedMetrics .Score = scoreSpeechCandidate (refinedMetrics )
3193+
3194+ // Store refinement metadata
3195+ refinedMetrics .WasRefined = true
3196+ refinedMetrics .OriginalStart = originalRegion .Start
3197+ refinedMetrics .OriginalDuration = originalRegion .Duration
3198+
3199+ // Replace the unrefined candidate in the list
3200+ for i := range result .Candidates {
3201+ if result .Candidates [i ].Region .Start == originalRegion .Start {
3202+ result .Candidates [i ] = * refinedMetrics
3203+ break
3204+ }
3205+ }
3206+
3207+ // Update best region to refined version
3208+ bestCandidate = refined
3209+ }
3210+ }
3211+ }
3212+ }
3213+
30293214 result .BestRegion = bestCandidate
30303215 return result
30313216}
0 commit comments