You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

31 lines
1020 B

import fitz
import sys
# (source PDF, destination text file) pairs. Every destination sits next to
# its PDF and shares its name, with the '.pdf' extension swapped for '.txt'.
files = [
    (pdf, pdf[:-len('.pdf')] + '.txt')
    for pdf in (
        r'C:\git\spark-lesson\reference\sources\bazelyan-noaa-preprint.pdf',
        r'C:\git\spark-lesson\reference\sources\plasma-nature-lightning-channels.pdf',
    )
]
# Extract the text of every PDF listed in `files` and write it to the paired
# .txt path as UTF-8, printing a short report per document.
for pdf_path, txt_path in files:
    try:
        # Open through the context manager so the document is closed even
        # when extraction fails part-way; the original never closed it.
        with fitz.open(pdf_path) as doc:
            print(f'\n=== {pdf_path} ===')
            print(f'Pages: {len(doc)}')
            output = []
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text()
                # Pages that yield only whitespace are left out of the output.
                if text.strip():
                    output.append(f'=== PAGE {page_number} ===')
                    output.append(text)
        full_text = '\n'.join(output)
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(full_text)
        # Each kept page contributes two entries (header + text), hence // 2.
        print(f'Extracted {len(output)//2} pages')
        print(f'Total characters: {len(full_text)}')
    except Exception as e:
        # Batch-script boundary: report the failure and continue with the
        # next file instead of aborting the whole run.
        print(f'Error with {pdf_path}: {e}')