TTS fixes, Inworld improvements, footer redesign, episodes 15-25, invoice script fix

- Fix TTS text pipeline: new caps handling (spell out unknown acronyms, lowercase emphasis words), action-word lookahead for parenthetical stripping, abbreviation expansions (US→United States, NM→New Mexico), pronunciation fixes
- Inworld TTS: camelCase API fields, speakingRate per-voice overrides, retry logic with exponential backoff (3 attempts)
- Footer redesign: SVG icons for social/podcast links across all pages
- Stats page: show "Rate us on Spotify" instead of "not public" placeholder
- New voices, expanded caller prompts and problem scenarios
- Social posting via Postiz, YouTube upload in publish pipeline
- Episode transcripts 15-25, terms page, sitemap updates
- Fix invoice script: match Timing totals using merged Task+App intervals

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 12:38:58 -07:00
parent 08a35bddeb
commit 6eeab58464
34 changed files with 6545 additions and 512 deletions
@@ -624,27 +624,25 @@ def _find_transcript_region(labeled_words: list[dict], whisper_words: list[str],
 def add_speaker_labels(words: list[dict], labeled_transcript: str,
                       start_time: float, end_time: float,
                       segments: list[dict]) -> list[dict]:
-    """Add speaker labels AND correct word text using labeled transcript.
+    """Replace Whisper text with labeled transcript text, keeping Whisper timestamps.

-    Uses Needleman-Wunsch DP alignment to match Whisper words to the labeled
-    transcript. This handles insertions/deletions gracefully — one missed word
-    becomes a single gap instead of cascading failures.
+    The labeled transcript is the source of truth for TEXT. Whisper is only used
+    for TIMESTAMPS. Uses DP alignment to map between the two, then rebuilds the
+    word list from the labeled transcript with interpolated timestamps for any
+    words Whisper missed.
    """
    if not labeled_transcript or not words:
        return words

-    # Parse full transcript into flat word list
    all_labeled = _parse_full_transcript(labeled_transcript)
    if not all_labeled:
        return words

-    # Build whisper clean word list
    whisper_clean = []
    for w in words:
        clean = re.sub(r"[^\w']", '', w["word"].lower())
        whisper_clean.append(clean if clean else w["word"].lower())

-    # Find the matching region in the transcript
    region = _find_transcript_region(all_labeled, whisper_clean)
    if region is None:
        return words
@@ -653,46 +651,61 @@ def add_speaker_labels(words: list[dict], labeled_transcript: str,
    region_words = all_labeled[region_start:region_end]
    region_clean = [w["clean"] for w in region_words]

-    # Run DP alignment
    pairs = _align_sequences(whisper_clean, region_clean)

-    # Build speaker assignments from aligned pairs
-    # matched[whisper_idx] = (labeled_word_dict, score)
-    matched = {}
+    # Build mapping: labeled_idx -> whisper_idx (for timestamp lookup)
+    labeled_to_whisper = {}
    for w_idx, l_idx in pairs:
        if w_idx is not None and l_idx is not None:
            score = _word_score(whisper_clean[w_idx], region_clean[l_idx])
            if score > 0:
-                matched[w_idx] = (region_words[l_idx], score)
+                labeled_to_whisper[l_idx] = w_idx

-    # Apply matches and interpolate speakers for gaps
+    # Find the range of labeled words that actually overlap with this clip
+    # Use only labeled indices that have a whisper match to determine boundaries
+    matched_labeled_indices = sorted(labeled_to_whisper.keys())
+    if not matched_labeled_indices:
+        return words
+
+    first_labeled = matched_labeled_indices[0]
+    last_labeled = matched_labeled_indices[-1]
+
+    # Build output from labeled transcript words with whisper timestamps
+    result = []
    corrections = 0
-    for i, word_entry in enumerate(words):
-        if i in matched:
-            labeled_word, score = matched[i]
-            word_entry["speaker"] = labeled_word["speaker"]
+    for l_idx in range(first_labeled, last_labeled + 1):
+        labeled_word = region_words[l_idx]
+        word_text = re.sub(r'[^\w\s\'-]', '', labeled_word["word"]).strip()
+        if not word_text:
+            continue

-            # Replace text only on confident matches
-            corrected = re.sub(r'[^\w\s\'-]', '', labeled_word["word"])
-            if corrected:
-                if corrected.lower() != whisper_clean[i]:
-                    corrections += 1
-                word_entry["word"] = corrected
+        if l_idx in labeled_to_whisper:
+            w_idx = labeled_to_whisper[l_idx]
+            ts_start = words[w_idx]["start"]
+            ts_end = words[w_idx]["end"]
+            if word_text.lower() != whisper_clean[w_idx]:
+                corrections += 1
        else:
-            # Interpolate speaker from nearest matched neighbor
-            speaker = _interpolate_speaker(i, matched, len(words))
-            if speaker:
-                word_entry["speaker"] = speaker
+            # Interpolate timestamp from neighbors
+            ts_start, ts_end = _interpolate_timestamp(l_idx, labeled_to_whisper, words)
+
+        result.append({
+            "word": word_text,
+            "start": ts_start,
+            "end": ts_end,
+            "speaker": labeled_word["speaker"],
+        })

    if corrections:
        print(f"      Corrected {corrections} words from labeled transcript")
+    if len(result) != len(words):
+        print(f"      Word count: {len(words)} (whisper) -> {len(result)} (labeled)")

-    return words
+    return result


 def _interpolate_speaker(idx: int, matched: dict, n_words: int) -> str | None:
    """Find speaker from nearest matched neighbor."""
-    # Search outward from idx
    for dist in range(1, n_words):
        before = idx - dist
        after = idx + dist
@@ -703,30 +716,63 @@ def _interpolate_speaker(idx: int, matched: dict, n_words: int) -> str | None:
    return None


-def polish_clip_words(words: list[dict], labeled_transcript: str = "") -> list[dict]:
-    """Use LLM to fix punctuation, capitalization, and misheard words.
+def _interpolate_timestamp(labeled_idx: int, labeled_to_whisper: dict,
+                           words: list[dict]) -> tuple[float, float]:
+    """Interpolate timestamp for a labeled word with no direct whisper match.

-    Sends the raw whisper words to an LLM, gets back a corrected version,
-    and maps corrections back to the original timed words.
+    Finds the nearest matched neighbors before and after, then linearly
+    interpolates based on position.
+    """
+    before_l = after_l = None
+    for dist in range(1, len(labeled_to_whisper) + 10):
+        if before_l is None and (labeled_idx - dist) in labeled_to_whisper:
+            before_l = labeled_idx - dist
+        if after_l is None and (labeled_idx + dist) in labeled_to_whisper:
+            after_l = labeled_idx + dist
+        if before_l is not None and after_l is not None:
+            break
+
+    if before_l is not None and after_l is not None:
+        w_before = words[labeled_to_whisper[before_l]]
+        w_after = words[labeled_to_whisper[after_l]]
+        span = after_l - before_l
+        frac = (labeled_idx - before_l) / span
+        start = w_before["end"] + frac * (w_after["start"] - w_before["end"])
+        duration = (w_after["start"] - w_before["end"]) / span
+        return start, start + max(duration, 0.1)
+    elif before_l is not None:
+        w = words[labeled_to_whisper[before_l]]
+        offset = (labeled_idx - before_l) * 0.3
+        return w["end"] + offset, w["end"] + offset + 0.3
+    elif after_l is not None:
+        w = words[labeled_to_whisper[after_l]]
+        offset = (after_l - labeled_idx) * 0.3
+        return w["start"] - offset - 0.3, w["start"] - offset
+    else:
+        return 0.0, 0.3
+
+
+def polish_clip_words(words: list[dict], labeled_transcript: str = "") -> list[dict]:
+    """Use LLM to add punctuation and fix capitalization.
+
+    The word text is already correct (from the labeled transcript). This step
+    only adds sentence punctuation and proper capitalization.
    """
    if not words or not OPENROUTER_API_KEY:
        return words

    raw_text = " ".join(w["word"] for w in words)

-    context = ""
-    if labeled_transcript:
-        context = f"\nFor reference, here's the speaker-labeled transcript of this section (use it to correct misheard words and names):\n{labeled_transcript[:3000]}\n"
-
-    prompt = f"""Fix this podcast transcript excerpt so it reads as proper sentences. Fix punctuation, capitalization, and obvious misheard words.
+    prompt = f"""Add punctuation and capitalization to this podcast transcript excerpt so it reads as proper sentences.

 RULES:
 - Keep the EXACT same number of words in the EXACT same order
- Only change capitalization, punctuation attached to words, and obvious mishearings
+- The words themselves are already correct — do NOT change any word's spelling
+- Only add punctuation (periods, commas, question marks, exclamation marks) and fix capitalization
 - Do NOT add, remove, merge, or reorder words
 - Contractions count as one word (don't = 1 word)
 - Return ONLY the corrected text, nothing else
-{context}
+
 RAW TEXT ({len(words)} words):
 {raw_text}"""