You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
31 lines
1020 B
31 lines
1020 B
"""Extract the text of reference PDFs into UTF-8 text files.

For every (PDF, text file) pair in ``files`` the script opens the PDF with
PyMuPDF, collects the text of each non-empty page under a
``=== PAGE n ===`` header, writes the result to the text file and prints a
short summary. A failure on one file is reported and the remaining files
are still processed.
"""

import sys

import fitz

# (source PDF, destination text file) pairs to convert.
files = [
    (r'C:\git\spark-lesson\reference\sources\bazelyan-noaa-preprint.pdf',
     r'C:\git\spark-lesson\reference\sources\bazelyan-noaa-preprint.txt'),
    (r'C:\git\spark-lesson\reference\sources\plasma-nature-lightning-channels.pdf',
     r'C:\git\spark-lesson\reference\sources\plasma-nature-lightning-channels.txt'),
]

for pdf_path, txt_path in files:
    try:
        # Open the document as a context manager so it is closed (and the
        # PDF file released) even when extraction raises part-way through.
        with fitz.open(pdf_path) as doc:
            print(f'\n=== {pdf_path} ===')
            print(f'Pages: {len(doc)}')

            # Alternating entries: a page header, then that page's text.
            output = []
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text()
                # Pages with no extractable text are skipped entirely.
                if text.strip():
                    output.append(f'=== PAGE {page_number} ===')
                    output.append(text)

        full_text = '\n'.join(output)
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(full_text)

        # Two list entries (header + text) were stored per extracted page.
        print(f'Extracted {len(output)//2} pages')
        print(f'Total characters: {len(full_text)}')
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job,
        # so one unreadable PDF or unwritable target must not stop the rest.
        print(f'Error with {pdf_path}: {e}')