40 lines
1.1 KiB
Python
40 lines
1.1 KiB
Python
#!/usr/bin/env python3
|
|
import sys
|
|
import re
|
|
|
|
def extract_text(filepath, word_limit: int = 350) -> str:
    """Return the first ``word_limit`` words of a file, skipping its first line.

    The file is opened as UTF-8 text. Its first line is treated as a header
    and discarded. The remaining content is split on whitespace, so blank
    lines and runs of spaces/tabs collapse, and the leading words are joined
    back together with single spaces.

    Args:
        filepath: Path of the text file to read.
        word_limit: Maximum number of words to return. Zero or a negative
            value yields an empty string.

    Returns:
        A single-line string containing at most ``word_limit`` words. The
        result is empty when the file has no words after its first line.
    """
    # Clamp at zero: a negative value used directly as a slice bound would
    # drop words from the *end* of the text instead of limiting the count.
    limit = max(word_limit, 0)

    words = []
    with open(filepath, 'r', encoding='utf-8') as f:
        # Skip header line (first line); the default avoids StopIteration
        # on an empty file.
        next(f, None)
        for line in f:
            # split() without arguments discards leading/trailing whitespace
            # and yields nothing for blank lines, so no separate strip or
            # whitespace-collapsing pass is needed.
            words.extend(line.split())
            if len(words) >= limit:
                # Enough words collected; no need to read the rest.
                break

    # The last line read may have pushed the count past the limit.
    return ' '.join(words[:limit])
|
|
|
|
def main(argv=None):
    """Run the command-line interface: extract text from one file to another.

    Args:
        argv: Full argument list with the program name at index 0.
            Defaults to ``sys.argv`` when omitted.

    Returns:
        The process exit status: 0 on success, 1 when the number of
        arguments is wrong.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 3:
        # Usage errors go to stderr so they never mix with regular output
        # that a caller may be capturing from stdout.
        print("Usage: extract_text.py <input.md> <output.txt>", file=sys.stderr)
        return 1

    _, input_file, output_file = args
    text = extract_text(input_file)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"Extracted {len(text.split())} words to {output_file}")
    return 0


if __name__ == '__main__':
    sys.exit(main())