Files
the-testament/audiobook/extract_text.py

40 lines
1.1 KiB
Python

#!/usr/bin/env python3
import sys
import re
def extract_text(filepath, word_limit=350):
    """Return the leading words of a text file, ignoring its first line.

    The file is read as UTF-8. Its first line is treated as a header and
    discarded; the remaining content is split on whitespace and the first
    ``word_limit`` words are joined back together with single spaces.
    Blank lines and runs of spaces/tabs/newlines therefore never appear in
    the result.

    Args:
        filepath: Path of the file to read.
        word_limit: Maximum number of words to keep (default 350).

    Returns:
        A single-line string of at most ``word_limit`` words. Empty if the
        file has no words after its first line.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        # Skip header line (first line); a no-op on an empty file.
        f.readline()
        body = f.read()

    # str.split() with no arguments splits on any whitespace run and drops
    # leading/trailing whitespace, so blank lines at the start and repeated
    # spaces need no separate handling.
    words = body.split()

    # Slicing already copes with texts shorter than the limit.
    # The cut may fall mid-sentence; that is accepted.
    return ' '.join(words[:word_limit])
if __name__ == '__main__':
    # Command-line entry point: extract_text.py <input.md> <output.txt>
    if len(sys.argv) != 3:
        # Usage errors go to stderr so they do not mix with normal output.
        print("Usage: extract_text.py <input.md> <output.txt>", file=sys.stderr)
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    # Uses extract_text's default word limit.
    text = extract_text(input_file)

    # Write the extracted text as UTF-8, with no trailing newline added.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(text)

    print(f"Extracted {len(text.split())} words to {output_file}")