太方便了!告別復(fù)制粘貼,Python 輕松實(shí)現(xiàn) PDF 轉(zhuǎn)文本!


將 pdf 轉(zhuǎn)換為圖片; 檢測(cè)和識(shí)別圖像中的文本; 展示示例輸出。
基于深度學(xué)習(xí)的 OCR 將 pdf 轉(zhuǎn)錄為文本

from pdf2image import convert_from_path
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
pdf_path = "path/to/file/intro_RL_Lecture1.pdf"
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
fname = "image" + str(i) + ".png"
image.save(fname, "PNG")

檢測(cè)和識(shí)別圖像中的文本
# adapted from this source: https://github.com/courao/ocr.pytorch
%load_ext autoreload
%autoreload 2
import os
from ocr import ocr
import time
import shutil
import numpy as np
import pathlib
from PIL import Image
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pytesseract
def single_pic_proc(image_file):
image = np.array(Image.open(image_file).convert('RGB'))
result, image_framed = ocr(image)
return result,image_framed
image_files = glob('./input_images/*.*')
result_dir = './output_images_with_boxes/'
# If the output folder exists we will remove it and redo it.
if os.path.exists(result_dir):
shutil.rmtree(result_dir)
os.mkdir(result_dir)
for image_file in sorted(image_files):
result, image_framed = single_pic_proc(image_file) # detecting and recognizing the text
filename = pathlib.Path(image_file).name
output_file = os.path.join(result_dir, image_file.split('/')[-1])
txt_file = os.path.join(result_dir, image_file.split('/')[-1].split('.')[0]+'.txt')
txt_f = open(txt_file, 'w')
Image.fromarray(image_framed).save(output_file)
for key in result:
txt_f.write(result[key][1]+'\n')
txt_f.close()
示例輸出
import cv2 as cv
output_dir = pathlib.Path("./output_images_with_boxes")
# image = cv.imread(str(np.random.choice(list(output_dir.iterdir()),1)[0]))
image = cv.imread(f"{output_dir}/image7.png")
size_reshaped = (int(image.shape[1]),int(image.shape[0]))
image = cv.resize(image, size_reshaped)
cv.imshow("image", image)
cv.waitKey(0)
cv.destroyAllWindows()

filename = f"{output_dir}/image7.txt"
with open(filename, "r") as text:
for line in text.readlines():
print(line.strip("\n"))
搜索下方加老師微信
老師微信號(hào):XTUOL1988【切記備注:學(xué)習(xí)Python】
領(lǐng)取Python web開(kāi)發(fā),Python爬蟲(chóng),Python數(shù)據(jù)分析,人工智能等精品學(xué)習(xí)課程。帶你從零基礎(chǔ)系統(tǒng)性的學(xué)好Python!
*聲明:本文于網(wǎng)絡(luò)整理,版權(quán)歸原作者所有,如來(lái)源信息有誤或侵犯權(quán)益,請(qǐng)聯(lián)系我們刪除或授權(quán)


