告別「復制+粘貼」,基于深度學習的 OCR,實現(xiàn) PDF 轉文本


將 pdf 轉換為圖片;
檢測和識別圖像中的文本;
展示示例輸出。

from pdf2image import convert_from_path
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
pdf_path = "path/to/file/intro_RL_Lecture1.pdf"
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
fname = "image" + str(i) + ".png"
image.save(fname, "PNG")

# adapted from this source: https://github.com/courao/ocr.pytorch
%load_ext autoreload
%autoreload 2
import os
from ocr import ocr
import time
import shutil
import numpy as np
import pathlib
from PIL import Image
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pytesseract
def single_pic_proc(image_file):
image = np.array(Image.open(image_file).convert('RGB'))
result, image_framed = ocr(image)
return result,image_framed
image_files = glob('./input_images/*.*')
result_dir = './output_images_with_boxes/'
# If the output folder exists we will remove it and redo it.
if os.path.exists(result_dir):
shutil.rmtree(result_dir)
os.mkdir(result_dir)
for image_file in sorted(image_files):
result, image_framed = single_pic_proc(image_file) # detecting and recognizing the text
filename = pathlib.Path(image_file).name
output_file = os.path.join(result_dir, image_file.split('/')[-1])
txt_file = os.path.join(result_dir, image_file.split('/')[-1].split('.')[0]+'.txt')
txt_f = open(txt_file, 'w')
Image.fromarray(image_framed).save(output_file)
for key in result:
txt_f.write(result[key][1]+'\n')
txt_f.close()
import cv2 as cv
output_dir = pathlib.Path("./output_images_with_boxes")
# image = cv.imread(str(np.random.choice(list(output_dir.iterdir()),1)[0]))
image = cv.imread(f"{output_dir}/image7.png")
size_reshaped = (int(image.shape[1]),int(image.shape[0]))
image = cv.resize(image, size_reshaped)
cv.imshow("image", image)
cv.waitKey(0)
cv.destroyAllWindows()

filename = f"{output_dir}/image7.txt"
with open(filename, "r") as text:
for line in text.readlines():
print(line.strip("\n"))
福利時間
近日吳恩達新書《Machine Learning Yearning》中文版開放下載!

《Machine Learning Yearning》是吳恩達歷時兩年,根據(jù)自己多年實踐經驗整理出來的一本機器學習、深度學習實踐經驗寶典。里面講的機器學習課程非常棒,很適合數(shù)學基礎不是很好的人自學,最近中文版也開放下載閱讀了!
如何下載?
1. 識別并關注下方公眾號;
2. 在下面公眾號(非本號)后臺回復關鍵字「吳恩達」。
評論
圖片
表情
