#Project#基于python的PDF文本翻譯實作
共 10391字,需瀏覽 21分鐘
·
2024-05-15 22:30
“ 文章所涉及內容更多來自網絡,在此聲明,并感謝知識的貢獻者!”
項目實現目標
—
項目實現目標:
1 實現PDF文件轉成TXT文件
2 實現識別圖片中的文字,并輸出TXT文件
3 基于有道詞典進行查詞
項目所在環境
—
項目所在環境:
Operation system:Centos 7
Python Version: python 3.6.1
項目環境初始
—
項目環境初始:
Centos下的安裝指令:
yum update
yum groupinstall "Development tools"
yum -y install automake autoconf libtool zlib-devel libjpeg-devel giflib libtiff-devel libwebp libwebp-devel libicu-devel openjpeg-devel cairo-devel
yum install gcc
pip3 install wand
pip3 install pytesseract
pip3 install pillow
pip3 install tesseract
wget https://github.com/tesseract-ocr/tesseract/archive/3.04.01.tar.gz
mv 3.04.01.tar.gz tesseract-3.04.01.tar.gz
tar xzvf tesseract-3.04.01.tar.gz
cd tesseract-3.04.01/
./autogen.sh
./configure
make
make install
ldconfig
pip3 install pyocr
yum install python-imaging
yum install ImageMagick-devel
export TESSDATA_PREFIX=/usr/local/share/tessdata
pip3 install pdfminer.six
pip3 install urllib
項目文件架構
—
項目文件架構:
項目Python源碼
—
項目Python源碼:
# -*- encoding: utf-8 -*-
import os
import io
import json
from wand.image import Image
from PIL import Image as PI
from PIL import ImageEnhance
import pyocr
import pyocr.builders
import re
import pytesseract as ocr
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import *
import urllib.request
import urllib.parse
# List every entry of a folder located under the current working directory.
def get_files(file_folder):
    """Return the paths of all entries inside ``file_folder``, sorted by name.

    Args:
        file_folder: Folder name, relative to the current working directory.

    Returns:
        A list of path strings of the form ``<cwd>/<file_folder>/<entry>``
        in ascending entry-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when the
            folder does not exist).
    """
    path = os.getcwd() + '/' + file_folder + '/'
    # os.listdir() yields entries in arbitrary order; sort them so callers
    # that index into the result pick the same file on every run.
    return [path + file_name for file_name in sorted(os.listdir(path))]
# Extract the text of an image-based (scanned) PDF by running OCR on it.
def pdf_image_to_string(pdf_file, lang_code=0):
    """Render ``pdf_file`` to JPEG images and OCR each of them.

    Args:
        pdf_file: Path of the PDF, handed to wand's ``Image(filename=...)``.
        lang_code: Index into the list returned by the OCR tool's
            ``get_available_languages()``.

    Returns:
        A list holding one OCR result per image of the converted sequence,
        in sequence order.

    Raises:
        IndexError: If pyocr reports no available tool, or ``lang_code`` is
            outside the tool's language list.
    """
    # pyocr can drive more than one OCR backend; the first available one is
    # used (the original author only had tesseract installed).
    tool = pyocr.get_available_tools()[0]
    # Pick the recognition language by position; inspect
    # tool.get_available_languages() to see what each index means.
    lang = tool.get_available_languages()[lang_code]

    # Render the PDF at 300 dpi and keep each page as an in-memory JPEG blob.
    # The wand images wrap native resources, so release them deterministically
    # with context managers instead of leaving them to the garbage collector.
    req_image = []
    with Image(filename=pdf_file, resolution=300) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    req_image.append(img_page.make_blob('jpeg'))

    # Run OCR on every rendered page.
    final_text = []
    for blob in req_image:
        txt = tool.image_to_string(
            PI.open(io.BytesIO(blob)),
            lang=lang,
            builder=pyocr.builders.TextBuilder(),
        )
        final_text.append(txt)
    return final_text
# Extract the text content of a PDF that carries a real text layer.
def pdf_text_to_string(pdf_file):
    """Return the text of every horizontal text box found in ``pdf_file``.

    All whitespace (including line breaks) is stripped from each text box,
    and boxes that end up empty are skipped.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        A list of strings, one per non-empty ``LTTextBoxHorizontal``, in the
        order the pages and their layout objects are iterated.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    final_text = []
    whitespace = re.compile(r'\s+')
    # Keep the file open for the whole extraction (the parser is built on the
    # file object), and make sure it is closed afterwards, even on error.
    with open(pdf_file, 'rb') as fp:
        # Parser bound to the file object.
        parser = PDFParser(fp)
        # Document object storing the PDF structure. A password could be
        # supplied here: PDFDocument(parser, password).
        document = PDFDocument(parser)
        # Refuse to continue when the document forbids text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager for shared resources; caching is disabled.
        rsrcmgr = PDFResourceManager(caching=False)
        # Layout-analysis parameters (defaults).
        laparams = LAParams()
        # Aggregator device that collects the layout of each page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Interpreter that renders pages onto the device.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the document one page at a time.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # LTPage object holding whatever was parsed from this page
            # (text boxes, figures, images, ...).
            layout = device.get_result()
            for x in layout:
                # Only horizontal text boxes contribute text.
                if isinstance(x, LTTextBoxHorizontal):
                    text = whitespace.sub('', x.get_text())
                    if text:
                        final_text.append(text)
    return final_text
# Recognise the text of one image file through pyocr.
def pyocr_image_to_string(img_file, lang_code=0):
    """OCR ``img_file`` with the first OCR tool pyocr reports as available.

    Args:
        img_file: Image path (or file object) accepted by PIL's ``Image.open``.
        lang_code: Index into the tool's ``get_available_languages()`` list.

    Returns:
        Whatever the tool's ``image_to_string`` produces with a
        ``TextBuilder``.
    """
    ocr_tool = pyocr.get_available_tools()[0]
    chosen_lang = ocr_tool.get_available_languages()[lang_code]
    picture = PI.open(img_file)
    return ocr_tool.image_to_string(
        picture,
        lang=chosen_lang,
        builder=pyocr.builders.TextBuilder(),
    )
# Recognise the text of one image file through pytesseract.
def pytess_imgae_to_string(img_file, lang='chi_sim'):
    """OCR ``img_file`` with pytesseract.

    Args:
        img_file: Image path (or file object) accepted by PIL's ``Image.open``.
        lang: Language code forwarded to ``pytesseract.image_to_string``.
            Defaults to ``'chi_sim'``, the value that used to be hard-coded.

    Returns:
        The result of ``pytesseract.image_to_string`` for the image.
    """
    img = PI.open(img_file)
    return ocr.image_to_string(img, lang=lang)
# Recognise the text of one image after boosting its contrast.
def enhance_image_to_string(img_file, lang='chi_sim', contrast=4):
    """Raise the contrast of ``img_file`` and OCR the result with pytesseract.

    Args:
        img_file: Image path (or file object) accepted by PIL's ``Image.open``.
        lang: Language code forwarded to ``pytesseract.image_to_string``.
            Defaults to ``'chi_sim'``, the value that used to be hard-coded.
        contrast: Factor handed to ``ImageEnhance.Contrast(...).enhance``.
            Defaults to ``4``, the value that used to be hard-coded.

    Returns:
        The result of ``pytesseract.image_to_string`` for the enhanced image.
    """
    image = PI.open(img_file)
    # The original author boosts contrast to improve the recognition rate.
    image_enhancer = ImageEnhance.Contrast(image).enhance(contrast)
    return ocr.image_to_string(image_enhancer, lang=lang)
# Translate text through Youdao's web endpoint while posing as a browser.
def youdao_html_translate(text, url="http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&sessionFrom="):
    """POST ``text`` to the Youdao translate endpoint and return the translation.

    Args:
        text: Text to translate; sent as the ``i`` form field.
        url: Endpoint that receives the form-encoded POST request.

    Returns:
        The translated text. Every ``tgt`` entry of the response's
        ``translateResult`` is included: entries of one inner list are
        concatenated, inner lists are joined with a newline. For a response
        with a single entry this equals ``translateResult[0][0]['tgt']``.

    Raises:
        urllib.error.URLError: If the request fails.
        KeyError: If the JSON response lacks the expected keys.

    Side effects:
        Installs the opener built here as the process-wide default urllib
        opener via ``urllib.request.install_opener``.
    """
    form = {
        'i': text,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '1503581407033',
        'sign': '67472a1b3638989677f7aca9af3be0aa',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION',
        'typoResult': 'true',
    }
    data = urllib.parse.urlencode(form).encode('utf-8')
    # NOTE(review): hard-coded third-party proxy. It is registered for the
    # "https" scheme only; any https URL passed in would be routed through
    # this host. Confirm whether it is still wanted before relying on it.
    proxy_support = urllib.request.ProxyHandler({"https": "222.161.16.10:9999"})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36')]
    urllib.request.install_opener(opener)
    req = urllib.request.Request(url, data)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')
    tar = json.loads(html)
    # translateResult is indexed as a list of lists of dicts with a 'tgt' key.
    # NOTE(review): presumably outer items are paragraphs and inner items are
    # sentences - confirm against the service. Returning only [0][0] dropped
    # everything after the first segment.
    return '\n'.join(
        ''.join(segment['tgt'] for segment in group)
        for group in tar['translateResult']
    )
# Write a string to a text file.
def string_to_text(file, text):
    """Write ``text`` to ``file`` as UTF-8, replacing any existing content.

    Args:
        file: Path of the file to create or overwrite.
        text: String to write.

    Raises:
        OSError: If the file cannot be opened or written (for example when
            the parent directory does not exist).
    """
    # The context manager closes the file on every path. Previously a failed
    # open() left the handle variable unbound, so the cleanup code raised
    # UnboundLocalError and hid the real error.
    # UTF-8 is set explicitly so non-ASCII text can be written whatever the
    # locale's default encoding is.
    with open(file, "w", encoding="utf-8") as f:
        f.write(text)
def translator(file_folder, file_type):
    """Extract text from files in ``file_folder``, translate it and save it.

    The first extracted text is translated with ``youdao_html_translate``,
    written to ``<cwd>/txt_files/test.txt`` and printed.

    Args:
        file_folder: Folder (relative to the current working directory) that
            is listed with ``get_files``.
        file_type: ``'PDF'`` runs OCR on the first listed file and text
            extraction on the second; ``'IMG'`` runs the three image OCR
            variants on the first listed file. Any other value, or a folder
            without usable files, falls back to a built-in sample string.
    """
    text = []
    # Extract text from PDFs.
    if file_type == 'PDF':
        pdf_files = get_files(file_folder)
        # The PDF extractors return lists of strings. Join each list so every
        # entry of `text` is one string; appending the raw list caused its
        # Python repr to be sent for translation.
        if pdf_files:
            text.append('\n'.join(pdf_image_to_string(pdf_files[0], lang_code=0)))
        if len(pdf_files) > 1:
            text.append('\n'.join(pdf_text_to_string(pdf_files[1])))
    # Extract text from an image.
    elif file_type == 'IMG':
        img_files = get_files(file_folder)
        if img_files:
            text.append(pyocr_image_to_string(img_files[0]))
            text.append(pytess_imgae_to_string(img_files[0]))
            text.append(enhance_image_to_string(img_files[0]))
    # Nothing extracted: translate a sample string instead.
    if not text:
        text.append('翻譯測試')
    translated_text = youdao_html_translate(text=text[0])
    # Create the output folder if needed so the write cannot fail on a
    # missing directory.
    out_dir = os.getcwd() + '/txt_files'
    os.makedirs(out_dir, exist_ok=True)
    string_to_text(file=out_dir + '/test.txt', text=translated_text)
    print(translated_text)
def main():
    """Run ``translator`` with the demo arguments used by the article.

    ``file_type='TEST'`` matches neither the ``'PDF'`` nor the ``'IMG'``
    branch of ``translator``, so its built-in sample string is translated.
    """
    demo_arguments = {'file_folder': 'empty_files', 'file_type': 'TEST'}
    translator(**demo_arguments)
項目參考資料
—
項目參考資料:
http://www.wisedream.net/2016/07/18/imgProcessing/ocr-with-pytesseract/
https://ivanzz1001.github.io/records/post/ocr/2017/09/08/tesseract-install
http://blog.topspeedsnail.com/archives/3571
http://blog.csdn.net/fighting_no1/article/details/51038942
http://blog.csdn.net/qq_21905401/article/details/77620561
