"""Extract text content from PDF to plain text file.""" import sys import fitz src = r'C:\git\spark-lesson\reference\sources\non-equilibrium-air-plasmas-becker-kogelschatz.pdf' dst = r'C:\git\spark-lesson\reference\sources\non-equilibrium-air-plasmas-becker-kogelschatz.txt' doc = fitz.open(src) print(f'Pages: {len(doc)}') print(f'Title: {doc.metadata.get("title", "N/A")}') print(f'Author: {doc.metadata.get("author", "N/A")}') text = [] for i, page in enumerate(doc): t = page.get_text() if t.strip(): text.append(f'--- Page {i+1} ---\n{t}') full = '\n'.join(text) print(f'Total chars: {len(full):,}') print(f'Estimated size: {len(full.encode("utf-8"))/1024/1024:.1f} MB') with open(dst, 'w', encoding='utf-8') as f: f.write(full) print(f'Written to {dst}') doc.close()