"""Extract the text of a fixed list of PDFs into UTF-8 .txt files using PyMuPDF.

For every (pdf_path, txt_path) pair in ``files`` the script opens the PDF,
collects the text of each non-blank page under a ``=== PAGE n ===`` header,
writes the result to ``txt_path`` and prints a short summary.  A failure on
one PDF is reported and does not stop the remaining files.
"""
import fitz
import sys

# (source PDF, destination text file) pairs processed in order.
files = [
    (r'C:\git\spark-lesson\reference\sources\bazelyan-noaa-preprint.pdf',
     r'C:\git\spark-lesson\reference\sources\bazelyan-noaa-preprint.txt'),
    (r'C:\git\spark-lesson\reference\sources\plasma-nature-lightning-channels.pdf',
     r'C:\git\spark-lesson\reference\sources\plasma-nature-lightning-channels.txt'),
]


def _extract_pdf_to_text(pdf_path, txt_path):
    """Write the text of every non-blank page of ``pdf_path`` to ``txt_path``.

    Prints the page count of the document, then the number of pages that
    yielded text and the total number of characters written.

    Raises:
        Exception: whatever ``fitz.open``, ``get_text`` or the file write
            raises; the caller decides how to report it.
    """
    chunks = []
    extracted_pages = 0

    # The context manager closes the document even if extraction fails;
    # the original script never released the handle.
    with fitz.open(pdf_path) as doc:
        print(f'\n=== {pdf_path} ===')
        print(f'Pages: {len(doc)}')
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text()
            # Pages with only whitespace are skipped entirely (no header).
            if text.strip():
                chunks.append(f'=== PAGE {page_number} ===')
                chunks.append(text)
                extracted_pages += 1

    full_text = '\n'.join(chunks)
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(full_text)

    print(f'Extracted {extracted_pages} pages')
    print(f'Total characters: {len(full_text)}')


for pdf_path, txt_path in files:
    try:
        _extract_pdf_to_text(pdf_path, txt_path)
    except Exception as e:
        # Deliberate best-effort: report the failure and move on to the next PDF.
        print(f'Error with {pdf_path}: {e}')