Skip to content

Commit a613feb

Browse files
committed
feat(processor): refine long speech regions to golden sub-window
- Add golden sub-region refinement: scoreSpeechIntervalWindow and refineToGoldenSpeechSubregion to isolate the best continuous 60s window inside long speech candidates.
- Introduce constants for golden window sizing and add SpeechCandidateMetrics fields: OriginalStart, OriginalDuration, WasRefined.
- Modify findBestSpeechRegion to refine candidates > 60s, re-measure the refined region, preserve original metadata, and replace candidate metrics.
- Update diagnostic logging to show refinement details for selected candidates.
- Add comprehensive unit tests for scoring, refinement, and integration with findBestSpeechRegion.

Improve spectral measurement accuracy for long speech regions so adaptive filter tuning uses a representative continuous speech window rather than averaging across pauses.

Signed-off-by: Martin Wimpress <martin@wimpress.org>
1 parent 6849db7 commit a613feb

3 files changed

Lines changed: 578 additions & 2 deletions

File tree

internal/logging/report.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2162,12 +2162,29 @@ func writeDiagnosticSpeech(f *os.File, measurements *processor.AudioMeasurements
21622162

21632163
for i, c := range measurements.SpeechCandidates {
21642164
// Check if this candidate was selected
2165-
isSelected := measurements.SpeechProfile != nil &&
2166-
c.Region.Start == measurements.SpeechProfile.Region.Start
2165+
// For refined regions, compare against original start since Region.Start is the refined position
2166+
isSelected := false
2167+
if measurements.SpeechProfile != nil {
2168+
if c.WasRefined {
2169+
isSelected = c.OriginalStart == measurements.SpeechProfile.OriginalStart
2170+
} else {
2171+
isSelected = c.Region.Start == measurements.SpeechProfile.Region.Start
2172+
}
2173+
}
21672174

21682175
if isSelected {
21692176
fmt.Fprintf(f, " Candidate %d: %.1fs at %.1fs (score: %.3f) [SELECTED]\n",
21702177
i+1, c.Region.Duration.Seconds(), c.Region.Start.Seconds(), c.Score)
2178+
2179+
// Show refinement details if this candidate was refined to a golden sub-region
2180+
if c.WasRefined {
2181+
fmt.Fprintf(f, " Refined: %.1fs at %.1fs → %.1fs at %.1fs (golden sub-region)\n",
2182+
c.OriginalDuration.Seconds(),
2183+
c.OriginalStart.Seconds(),
2184+
c.Region.Duration.Seconds(),
2185+
c.Region.Start.Seconds())
2186+
}
2187+
21712188
fmt.Fprintf(f, " Amplitude:\n")
21722189
fmt.Fprintf(f, " RMS Level: %.1f dBFS\n", c.RMSLevel)
21732190
fmt.Fprintf(f, " Peak Level: %.1f dBFS\n", c.PeakLevel)

internal/processor/analyzer.go

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,11 @@ type SpeechCandidateMetrics struct {
135135

136136
// Scoring
137137
Score float64 `json:"score"` // Composite score for candidate ranking
138+
139+
// Golden sub-region refinement info (populated when a long candidate is refined)
140+
OriginalStart time.Duration `json:"original_start,omitempty"` // Original candidate start before refinement
141+
OriginalDuration time.Duration `json:"original_duration,omitempty"` // Original candidate duration before refinement
142+
WasRefined bool `json:"was_refined,omitempty"` // True if region was refined from a longer candidate
138143
}
139144

140145
// IntervalSample contains all measurements for a 250ms audio window.
@@ -530,6 +535,12 @@ const (
530535
// speechEntropyMax is the maximum entropy for speech (structured signal).
531536
// Pure noise approaches 1.0; speech is typically 0.3-0.7.
532537
speechEntropyMax = 0.85
538+
539+
// Golden speech region refinement constants
540+
// After selecting the best speech candidate, refine to a representative sub-window
541+
// to avoid averaging across pauses that contaminate spectral metrics.
542+
goldenSpeechWindowDuration = 60 * time.Second // Target: 60s of representative speech
543+
goldenSpeechWindowMinimum = 30 * time.Second // Minimum acceptable window
533544
)
534545

535546
// refineToGoldenSubregion finds the cleanest sub-region within a silence candidate.
@@ -788,6 +799,143 @@ func scoreIntervalWindow(intervals []IntervalSample) float64 {
788799
return sumRMS / float64(len(intervals))
789800
}
790801

802+
// scoreSpeechIntervalWindow calculates a quality score for a contiguous window of speech intervals.
803+
// Returns a 0-1 score where higher = better quality speech for profiling.
804+
// Scores based on spectral characteristics that indicate clear, continuous speech:
805+
// - Kurtosis (0.20): higher average = clearer harmonics
806+
// - Flatness (0.20): lower = more tonal (inverted)
807+
// - Centroid (0.20): peak at voice centre (~2000 Hz), decay toward edges
808+
// - Consistency (0.25): low kurtosis variance = stable voicing
809+
// - RMS (0.15): louder = more active speech
810+
func scoreSpeechIntervalWindow(intervals []IntervalSample) float64 {
811+
if len(intervals) == 0 {
812+
return 0 // Should not happen in normal use
813+
}
814+
815+
n := float64(len(intervals))
816+
817+
// Accumulate metrics
818+
var kurtosisSum, flatnessSum, centroidSum, rmsSum float64
819+
kurtosisValues := make([]float64, len(intervals))
820+
821+
for i, interval := range intervals {
822+
kurtosisSum += interval.SpectralKurtosis
823+
flatnessSum += interval.SpectralFlatness
824+
centroidSum += interval.SpectralCentroid
825+
rmsSum += interval.RMSLevel
826+
kurtosisValues[i] = interval.SpectralKurtosis
827+
}
828+
829+
avgKurtosis := kurtosisSum / n
830+
avgFlatness := flatnessSum / n
831+
avgCentroid := centroidSum / n
832+
avgRMS := rmsSum / n
833+
834+
// Calculate kurtosis variance for consistency score
835+
var kurtosisVarianceSum float64
836+
for _, k := range kurtosisValues {
837+
diff := k - avgKurtosis
838+
kurtosisVarianceSum += diff * diff
839+
}
840+
kurtosisVariance := kurtosisVarianceSum / n
841+
842+
// Kurtosis score (0.20): higher kurtosis = clearer harmonics
843+
// Typical speech kurtosis ranges 2-10; score peaks around 6
844+
kurtosisScore := clampFloat(avgKurtosis/6.0, 0.0, 1.0)
845+
846+
// Flatness score (0.20): lower flatness = more tonal = better speech
847+
// Flatness 0 = pure tone, 1 = white noise; speech typically 0.1-0.4
848+
flatnessScore := clampFloat(1.0-avgFlatness, 0.0, 1.0)
849+
850+
// Centroid score (0.20): peak at voice centre, decay toward edges
851+
// Voice range: speechCentroidMin (200 Hz) to speechCentroidMax (4500 Hz)
852+
centroidScore := 0.0
853+
if avgCentroid >= speechCentroidMin && avgCentroid <= speechCentroidMax {
854+
// Calculate distance from ideal centre (~2000 Hz)
855+
voiceMid := (speechCentroidMin + speechCentroidMax) / 2
856+
voiceHalfWidth := (speechCentroidMax - speechCentroidMin) / 2
857+
distFromMid := math.Abs(avgCentroid - voiceMid)
858+
// Score decays to 0.5 at edges, 1.0 at centre
859+
centroidScore = 1.0 - (distFromMid/voiceHalfWidth)*0.5
860+
}
861+
862+
// Consistency score (0.25): low kurtosis variance = stable voicing
863+
// Variance > 100 is very inconsistent; clamp score at that point
864+
consistencyScore := clampFloat(1.0-(kurtosisVariance/100.0), 0.0, 1.0)
865+
866+
// RMS score (0.15): louder = more active speech
867+
// Range: -30 dBFS (worst) to -12 dBFS (best)
868+
rmsScore := 0.0
869+
if avgRMS > -30.0 {
870+
rmsScore = clampFloat((avgRMS-(-30.0))/18.0, 0.0, 1.0)
871+
}
872+
873+
// Weighted combination (per plan: consistency elevated to 0.25)
874+
return kurtosisScore*0.20 + flatnessScore*0.20 + centroidScore*0.20 + consistencyScore*0.25 + rmsScore*0.15
875+
}
876+
877+
// refineToGoldenSpeechSubregion finds the most representative sub-region within a speech candidate.
878+
// Uses existing interval samples to find the window with highest speech quality score.
879+
// Returns the original region if it's already at or below goldenSpeechWindowDuration,
880+
// or if refinement fails for any reason (insufficient intervals, etc.).
881+
//
882+
// This addresses cases where a long speech region contains pauses that contaminate
883+
// spectral metrics when averaged. By refining to the best 60s window, we isolate
884+
// continuous speech for more accurate adaptive filter tuning.
885+
func refineToGoldenSpeechSubregion(candidate *SpeechRegion, intervals []IntervalSample) *SpeechRegion {
886+
if candidate == nil {
887+
return nil
888+
}
889+
890+
// No refinement needed if already at or below target duration
891+
if candidate.Duration <= goldenSpeechWindowDuration {
892+
return candidate
893+
}
894+
895+
// Extract intervals within the candidate's time range
896+
candidateIntervals := getIntervalsInRange(intervals, candidate.Start, candidate.End)
897+
if candidateIntervals == nil {
898+
return candidate
899+
}
900+
901+
// Calculate window size in intervals (60s / 250ms = 240 intervals)
902+
windowIntervals := int(goldenSpeechWindowDuration / goldenIntervalSize)
903+
minimumIntervals := int(goldenSpeechWindowMinimum / goldenIntervalSize)
904+
905+
// Need at least minimum window worth of intervals
906+
if len(candidateIntervals) < minimumIntervals {
907+
return candidate
908+
}
909+
910+
// If we have fewer intervals than target window, use what we have
911+
if len(candidateIntervals) < windowIntervals {
912+
windowIntervals = len(candidateIntervals)
913+
}
914+
915+
// Slide window across intervals, finding position with highest speech quality score
916+
bestStartIdx := 0
917+
bestScore := scoreSpeechIntervalWindow(candidateIntervals[:windowIntervals])
918+
919+
for startIdx := 1; startIdx <= len(candidateIntervals)-windowIntervals; startIdx++ {
920+
windowScore := scoreSpeechIntervalWindow(candidateIntervals[startIdx : startIdx+windowIntervals])
921+
if windowScore > bestScore {
922+
bestScore = windowScore
923+
bestStartIdx = startIdx
924+
}
925+
}
926+
927+
// Calculate refined region bounds from the best window position
928+
refinedStart := candidateIntervals[bestStartIdx].Timestamp
929+
refinedDuration := time.Duration(windowIntervals) * goldenIntervalSize
930+
refinedEnd := refinedStart + refinedDuration
931+
932+
return &SpeechRegion{
933+
Start: refinedStart,
934+
End: refinedEnd,
935+
Duration: refinedDuration,
936+
}
937+
}
938+
791939
// roomToneScore calculates a 0-1 score indicating how likely an interval is room tone.
792940
// Room tone has characteristic spectral behaviour:
793941
// - Low SpectralFlux (stable, not changing)
@@ -2992,6 +3140,8 @@ type findBestSpeechRegionResult struct {
29923140
// findBestSpeechRegion selects the best speech region for measurements.
29933141
// Strategy: prefer longest duration that meets quality threshold.
29943142
// Unlike silence (where earlier is better), speech benefits from longer samples.
3143+
// For long candidates (>60s), refines to the best 60s sub-region to avoid
3144+
// contaminating spectral metrics with pauses.
29953145
func findBestSpeechRegion(regions []SpeechRegion, intervals []IntervalSample) *findBestSpeechRegionResult {
29963146
result := &findBestSpeechRegionResult{}
29973147

@@ -3026,6 +3176,41 @@ func findBestSpeechRegion(regions []SpeechRegion, intervals []IntervalSample) *f
30263176
}
30273177
}
30283178

3179+
// Refine long candidates to golden sub-region
3180+
if bestCandidate != nil && bestCandidate.Duration > goldenSpeechWindowDuration {
3181+
originalRegion := *bestCandidate
3182+
refined := refineToGoldenSpeechSubregion(bestCandidate, intervals)
3183+
3184+
if refined != nil {
3185+
wasRefined := refined.Start != originalRegion.Start ||
3186+
refined.Duration != originalRegion.Duration
3187+
3188+
if wasRefined {
3189+
// Re-measure the refined region
3190+
refinedMetrics := measureSpeechCandidateFromIntervals(*refined, intervals)
3191+
if refinedMetrics != nil {
3192+
refinedMetrics.Score = scoreSpeechCandidate(refinedMetrics)
3193+
3194+
// Store refinement metadata
3195+
refinedMetrics.WasRefined = true
3196+
refinedMetrics.OriginalStart = originalRegion.Start
3197+
refinedMetrics.OriginalDuration = originalRegion.Duration
3198+
3199+
// Replace the unrefined candidate in the list
3200+
for i := range result.Candidates {
3201+
if result.Candidates[i].Region.Start == originalRegion.Start {
3202+
result.Candidates[i] = *refinedMetrics
3203+
break
3204+
}
3205+
}
3206+
3207+
// Update best region to refined version
3208+
bestCandidate = refined
3209+
}
3210+
}
3211+
}
3212+
}
3213+
30293214
result.BestRegion = bestCandidate
30303215
return result
30313216
}

0 commit comments

Comments
 (0)