告別複製粘貼,Python 實現 PDF 轉文本
↑ 關注 + 星標 ,每天學Python新技能
後臺回復【大禮包】送你Python自學大禮包
機器之心編譯

將 PDF 轉換為圖片;
檢測和識別圖像中的文本;
展示示例輸出。

# Step 1: convert the PDF into PNG images (image0.png, image1.png, ...)
# written to the current working directory.
from pdf2image import convert_from_path
# These exception types are imported but not referenced in the visible code;
# they are available for wrapping the conversion call in a try/except.
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError,
)

pdf_path = "path/to/file/intro_RL_Lecture1.pdf"

# Each element of `images` is saved as its own PNG, numbered from 0 in the
# order returned by convert_from_path.
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
    fname = f"image{i}.png"
    image.save(fname, "PNG")

#?adapted?from?this?source:?https://github.com/courao/ocr.pytorch
%load_ext?autoreload
%autoreload?2
import?os
from?ocr?import?ocr
import?time
import?shutil
import?numpy?as?np
import?pathlib
from?PIL?import?Image
from?glob?import?glob
import?matplotlib.pyplot?as?plt
import?seaborn?as?sns
sns.set()
import?pytesseract
def single_pic_proc(image_file):
    """Run text detection and recognition on a single image file.

    Args:
        image_file: Path of the image to process; passed to ``Image.open``.

    Returns:
        The ``(result, image_framed)`` pair returned by ``ocr``.
        NOTE(review): ``image_framed`` is presumably the input image with
        the detected text boxes drawn on it -- confirm against ``ocr``.
    """
    # Use a context manager so the underlying file handle is released as
    # soon as the pixel data has been copied into the array.
    with Image.open(image_file) as img:
        image = np.array(img.convert('RGB'))
    result, image_framed = ocr(image)
    return result, image_framed
# Step 2: run OCR on every input image; for each one, write the framed image
# and a text file with the recognized lines into the output folder.
image_files = glob('./input_images/*.*')
result_dir = './output_images_with_boxes/'
# If the output folder exists we will remove it and redo it.
if os.path.exists(result_dir):
    shutil.rmtree(result_dir)
os.mkdir(result_dir)

for image_file in sorted(image_files):
    result, image_framed = single_pic_proc(image_file)  # detecting and recognizing the text
    # Derive the output names with pathlib rather than splitting on '/' and
    # '.', so this works with any path separator and keeps the full base
    # name of files that contain more than one dot.
    source_path = pathlib.Path(image_file)
    filename = source_path.name
    output_file = os.path.join(result_dir, filename)
    txt_file = os.path.join(result_dir, source_path.stem + '.txt')
    Image.fromarray(image_framed).save(output_file)
    # The `with` block closes the text file even if a write fails.
    with open(txt_file, 'w') as txt_f:
        for key in result:
            # Element 1 of each result entry is written as one line of text.
            txt_f.write(result[key][1] + '\n')
# Step 3a: display one of the framed output images in an OpenCV window.
import cv2 as cv

output_dir = pathlib.Path("./output_images_with_boxes")
# image = cv.imread(str(np.random.choice(list(output_dir.iterdir()),1)[0]))
sample_image_file = f"{output_dir}/image7.png"
image = cv.imread(sample_image_file)
# cv.imread returns None instead of raising when the file cannot be read;
# fail here with a clear message rather than on the `.shape` access below.
if image is None:
    raise FileNotFoundError(f"could not read image: {sample_image_file}")
# The target size equals the current (width, height), so the resize leaves
# the dimensions unchanged; scale these values to shrink or enlarge the view.
size_reshaped = (int(image.shape[1]), int(image.shape[0]))
image = cv.resize(image, size_reshaped)
cv.imshow("image", image)
cv.waitKey(0)  # block until a key is pressed
cv.destroyAllWindows()

# Step 3b: print the text written for the same sample image, one line at a time.
filename = f"{output_dir}/image7.txt"
with open(filename, "r") as text:
    # Iterate the file object directly instead of loading every line first.
    for line in text:
        print(line.rstrip("\n"))
原文鏈接:https://towardsdatascience.com/faster-notes-with-python-and-deep-learning-b713bbb3c186
推薦閱讀
您看此文用? ?
?分?
?
秒,轉發只需1秒哦
評論
圖片
表情

?
?分?
?
秒,轉發只需1秒哦