You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
27 lines
799 B
27 lines
799 B
"""Extract text content from PDF to plain text file."""
|
|
import sys
|
|
import fitz
|
|
|
|
# Input PDF and the plain-text file that receives the extracted content.
src = r'C:\git\spark-lesson\reference\sources\non-equilibrium-air-plasmas-becker-kogelschatz.pdf'
dst = r'C:\git\spark-lesson\reference\sources\non-equilibrium-air-plasmas-becker-kogelschatz.txt'

# Open the document in a ``with`` block so the handle is closed even when
# the metadata lookup or the text extraction raises part-way through.
with fitz.open(src) as doc:
    print(f'Pages: {len(doc)}')

    # The metadata dict (or individual values in it) may be None/empty, in
    # which case a ``get`` default would never apply; fall back with ``or``.
    metadata = doc.metadata or {}
    print(f'Title: {metadata.get("title") or "N/A"}')
    print(f'Author: {metadata.get("author") or "N/A"}')

    # Collect one labelled chunk per page (1-based page numbers), skipping
    # pages whose extracted text is empty or whitespace only.
    text = []
    for page_number, page in enumerate(doc, start=1):
        page_text = page.get_text()
        if page_text.strip():
            text.append(f'--- Page {page_number} ---\n{page_text}')

# The document is no longer needed once every page has been read.
full = '\n'.join(text)
print(f'Total chars: {len(full):,}')
print(f'Estimated size: {len(full.encode("utf-8"))/1024/1024:.1f} MB')

# Write everything in a single call, encoded as UTF-8.
with open(dst, 'w', encoding='utf-8') as f:
    f.write(full)

print(f'Written to {dst}')
|