# the-testament/audiobook/extract_text.py

#!/usr/bin/env python3
import sys
import re

def extract_text(filepath, word_limit=350):
    """Return the first ``word_limit`` words of a text file as one line.

    The first line of the file is treated as a header and skipped. The
    remaining content is split on whitespace (so blank lines and runs of
    spaces, tabs, or newlines vanish) and the words are joined with single
    spaces. The cut is made purely by word count and may fall mid-sentence.

    Args:
        filepath: Path to a UTF-8 encoded text file.
        word_limit: Maximum number of words to return; must not be negative.

    Returns:
        A string of at most ``word_limit`` whitespace-separated words, or an
        empty string when there is no text after the header line.

    Raises:
        ValueError: If ``word_limit`` is negative. A negative slice bound
            would otherwise silently drop words from the *end* of the text.
    """
    if word_limit < 0:
        raise ValueError(f"word_limit must be non-negative, got {word_limit}")

    words = []
    with open(filepath, 'r', encoding='utf-8') as f:
        # Discard the header line; the default keeps an empty file from
        # raising StopIteration.
        next(f, None)
        for line in f:
            # Stop reading as soon as enough words have been collected, so a
            # long document is not loaded in full for a short excerpt.
            if len(words) >= word_limit:
                break
            # split() with no arguments drops blank lines and collapses any
            # run of whitespace, so no separate normalisation pass is needed.
            words.extend(line.split())

    # The last line consumed may have pushed the count past the limit.
    return ' '.join(words[:word_limit])

if __name__ == '__main__':
    # Command-line entry point: expects exactly two positional arguments,
    # the source file to read and the destination file to write.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: extract_text.py <input.md> <output.txt>")
        sys.exit(1)

    input_file, output_file = cli_args

    # Extract with the default word limit and write the excerpt out as UTF-8.
    text = extract_text(input_file)
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write(text)

    # Report how many words actually ended up in the output file.
    print(f"Extracted {len(text.split())} words to {output_file}")