不引入ES,如何利用 MySQL 實現(xiàn)模糊匹配
共 12287字,需瀏覽 25分鐘
·
2024-07-30 07:39
閱讀本文大概需要 4 分鐘。
來自:
1. 業(yè)務(wù)場景概述
2. 實現(xiàn)思路
? 分詞
? 與庫里已有數(shù)據(jù)進(jìn)行匹配
? 按照匹配度對結(jié)果進(jìn)行排序
3. 模糊匹配技術(shù)選型
? 方案一:引入ES
? 方案二:利用MySQL實現(xiàn)
? like匹配: 要求模式串與整個目標(biāo)字段完全匹配;
? RegExp正則匹配: 要求目標(biāo)字段包含模式串即可;
? Fulltext全文索引: 在字段類型為
CHAR、VARCHAR、TEXT的列上創(chuàng)建全文索引,執(zhí)行SQL進(jìn)行查詢。
? like匹配: 無法滿足需求,所以pass;
? 全文索引: 可定制性差,不支持任意匹配查詢,pass;
? 正則匹配: 可實現(xiàn)任意模式匹配,缺點在于執(zhí)行效率不如全文索引。
4. 實現(xiàn)效果展示
5. 核心代碼
5.1 提取公司關(guān)鍵信息
/**
* 匹配前去除公司名稱的無意義信息
* @param targetCompanyName
* @return
*/
privateStringformatCompanyName(String targetCompanyName){
Stringregex="(?<province>[^省]+自治區(qū)|.*?省|.*?行政區(qū)|.*?市)"+
"?(?<city>[^市]+自治州|.*?地區(qū)|.*?行政單位|.+盟|市轄區(qū)|.*?市|.*?縣)"+
"?(?<county>[^(區(qū)|市|縣|旗|島)]+區(qū)|.*?市|.*?縣|.*?旗|.*?島)"+
"?(?<village>.*)";
Matchermatcher=Pattern.compile(regex).matcher(targetCompanyName);
while(matcher.find()){
Stringprovince= matcher.group("province");
log.info("province:{}",province);
if(StringUtils.isNotBlank(province)&& targetCompanyName.contains(province)){
targetCompanyName = targetCompanyName.replace(province,"");
}
log.info("處理完省份的公司名稱:{}",targetCompanyName);
Stringcity= matcher.group("city");
log.info("city:{}",city);
if(StringUtils.isNotBlank(city)&& targetCompanyName.contains(city)){
targetCompanyName = targetCompanyName.replace(city,"");
}
log.info("處理完城市的公司名稱:{}",targetCompanyName);
Stringcounty= matcher.group("county");
log.info("county:{}",county);
if(StringUtils.isNotBlank(county)&& targetCompanyName.contains(county)){
targetCompanyName = targetCompanyName.replace(county,"");
}
log.info("處理完區(qū)縣級的公司名稱:{}",targetCompanyName);
}
String[][] address =AddressUtil.ADDRESS;
for(String[] city: address){
for(String b : city ){
if(targetCompanyName.contains(b)){
targetCompanyName = targetCompanyName.replace(b,"");
}
}
}
log.info("處理后的公司名稱:{}",targetCompanyName);
return targetCompanyName;
}
public classAddressUtil{
publicstaticfinalString[][] ADDRESS ={
{"北京"},
{"天津"},
{"安徽","安慶","蚌埠","亳州","巢湖","池州","滁州","阜陽","合肥","淮北","淮南","黃山","六安","馬鞍山","宿州","銅陵","蕪湖","宣城"},
{"澳門"},
{"香港"},
{"福建","福州","龍巖","南平","寧德","莆田","泉州","廈門","漳州"},
{"甘肅","白銀","定西","甘南藏族自治州","嘉峪關(guān)","金昌","酒泉","蘭州","臨夏回族自治州","隴南","平?jīng)?,"慶陽","天水","武威","張掖"},
{"廣東","潮州","東莞","佛山","廣州","河源","惠州","江門","揭陽","茂名","梅州","清遠(yuǎn)","汕頭","汕尾","韶關(guān)","深圳","陽江","云浮","湛江","肇慶","中山","珠海"},
{"廣西","百色","北海","崇左","防城港","貴港","桂林","河池","賀州","來賓","柳州","南寧","欽州","梧州","玉林"},
{"貴州","安順","畢節(jié)地區(qū)","貴陽","六盤水","黔東南苗族侗族自治州","黔南布依族苗族自治州","黔西南布依族苗族自治州","銅仁地區(qū)","遵義"},
{"海南","海口","三亞","直轄縣級行政區(qū)劃"},
{"河北","保定","滄州","承德","邯鄲","衡水","廊坊","秦皇島","石家莊","唐山","邢臺","張家口"},
{"河南","安陽","鶴壁","焦作","開封","洛陽","漯河","南陽","平頂山","濮陽","三門峽","商丘","新鄉(xiāng)","信陽","許昌","鄭州","周口","駐馬店"},
{"黑龍江","大慶","大興安嶺地區(qū)","哈爾濱","鶴崗","黑河","雞西","佳木斯","牡丹江","七臺河","齊齊哈爾","雙鴨山","綏化","伊春"},
{"湖北","鄂州","恩施土家族苗族自治州","黃岡","黃石","荊門","荊州","十堰","隨州","武漢","咸寧","襄樊","孝感","宜昌"},
{"湖南","長沙","常德","郴州","衡陽","懷化","婁底","邵陽","湘潭","湘西土家族苗族自治州","益陽","永州","岳陽","張家界","株洲"},
{"吉林","白城","白山","長春","吉林","遼源","四平","松原","通化","延邊朝鮮族自治州"},
{"江蘇","常州","淮安","連云港","南京","南通","蘇州","宿遷","泰州","無錫","徐州","鹽城","揚州","鎮(zhèn)江"},
{"江西","撫州","贛州","吉安","景德鎮(zhèn)","九江","南昌","萍鄉(xiāng)","上饒","新余","宜春","鷹潭"},
{"遼寧","鞍山","本溪","朝陽","大連","丹東","撫順","阜新","葫蘆島","錦州","遼陽","盤錦","沈陽","鐵嶺","營口"},
{"內(nèi)蒙古","阿拉善盟","巴彥淖爾","包頭","赤峰","鄂爾多斯","呼和浩特","呼倫貝爾","通遼","烏海","烏蘭察布","錫林郭勒盟","興安盟"},
{"寧夏回族","固原","石嘴山","吳忠","銀川","中衛(wèi)"},
{"青海","果洛藏族自治州","海北藏族自治州","海東地區(qū)","海南藏族自治州","海西蒙古族藏族自治州","黃南藏族自治州","西寧","玉樹藏族自治州"},
{"山東","濱州","德州","東營","菏澤","濟(jì)南","濟(jì)寧","萊蕪","聊城","臨沂","青島","日照","泰安","威海","濰坊","煙臺","棗莊","淄博"},
{"山西","長治","大同","晉城","晉中","臨汾","呂梁","朔州","太原","忻州","陽泉","運城"},
{"陜西","安康","寶雞","漢中","商洛","銅川","渭南","西安","咸陽","延安","榆林"},
{"上海"},
{"四川","阿壩藏族羌族自治州","巴中","成都","達(dá)州","德陽","甘孜藏族自治州","廣安","廣元","樂山","涼山彝族自治州","瀘州","眉山","綿陽","內(nèi)江","南充","攀枝花","遂寧","雅安","宜賓","資陽","自貢"},
{"西藏","阿里地區(qū)","昌都地區(qū)","拉薩","林芝地區(qū)","那曲地區(qū)","日喀則地區(qū)","山南地區(qū)"},
{"新疆維吾爾","阿克蘇地區(qū)","阿勒泰地區(qū)","巴音郭楞蒙古自治州","博爾塔拉蒙古自治州","昌吉回族自治州","哈密地區(qū)","和田地區(qū)","喀什地區(qū)","克拉瑪依","克孜勒蘇柯爾克孜自治州","塔城地區(qū)","吐魯番地區(qū)","烏魯木齊","伊犁哈薩克自治州","直轄縣級行政區(qū)劃"},
{"云南","保山","楚雄彝族自治州","大理白族自治州","德宏傣族景頗族自治州","迪慶藏族自治州","紅河哈尼族彝族自治州","昆明","麗江","臨滄","怒江僳僳族自治州","普洱","曲靖","文山壯族苗族自治州","西雙版納傣族自治州","玉溪","昭通"},
{"浙江","杭州","湖州","嘉興","金華","麗水","寧波","衢州","紹興","臺州","溫州","舟山"},
{"重慶"},
{"臺灣","臺北","高雄","基隆","臺中","臺南","新竹","嘉義"},
};
}
5.2 分詞相關(guān)代碼
<!-- ikAnalyzer 中文分詞器 -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
<exclusions>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- lucene-queryParser 查詢分析器模塊 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>7.3.0</version>
</dependency>
@Slf4j
publicclassIKAnalyzerSupport{
/**
* IK分詞
* @param target
* @return
*/
publicstaticList<String>iKSegmenterToList(String target)throwsException{
if(StringUtils.isEmpty(target)){
returnnewArrayList();
}
List<String> result =newArrayList<>();
StringReadersr=newStringReader(target);
// false:關(guān)閉智能分詞 (對分詞的精度影響較大)
IKSegmenterik=newIKSegmenter(sr,true);
Lexeme lex;
while((lex=ik.next())!=null){
StringlexemeText= lex.getLexemeText();
result.add(lexemeText);
}
return result;
}
}
/**
* 對目標(biāo)公司名稱進(jìn)行分詞
* @param targetCompanyName
* @return
*/
privateStringsplitWord(String targetCompanyName){
log.info("對處理后端公司名稱進(jìn)行分詞");
List<String> splitWord =newArrayList<>();
Stringresult= targetCompanyName;
try{
splitWord = iKSegmenterToList(targetCompanyName);
result = splitWord.stream().map(String::valueOf).distinct().collect(Collectors.joining("|"));
log.info("分詞結(jié)果:{}",result);
}catch(Exception e){
log.error("分詞報錯:{}",e.getMessage());
}
return result;
}
5.3 匹配
public JsonResultmatchCompanyName(CompanyDTO companyDTO, String accessToken, String localIp){
// 對公司名稱進(jìn)行處理
StringsourceCompanyName= companyDTO.getCompanyName();
StringtargetCompanyName= sourceCompanyName;
log.info("處理前公司名稱:{}",targetCompanyName);
// 處理圓括號
targetCompanyName = targetCompanyName.replaceAll("[(]|[)]|[(]|[)]","");
// 處理公司相關(guān)關(guān)鍵詞
targetCompanyName = targetCompanyName.replaceAll("[(集團(tuán)|股份|有限|責(zé)任|分公司)]","");
if(!targetCompanyName.contains("銀行")){
// 去除行政區(qū)域
targetCompanyName = formatCompanyName(targetCompanyName);
}
// 分詞
StringsplitCompanyName= splitWord(targetCompanyName);
// 匹配
List<Company> matchedCompany = companyRepository.queryMatchCompanyName(splitCompanyName,targetCompanyName);
List<String> result =newArrayList();
for(Company companyInfo : matchedCompany){
result.add(companyInfo.getCompanyName());
if(companyDTO.getCompanyId().equals(companyInfo.getCompanyId())){
result.remove(companyInfo.getCompanyName());
}
}
returnJsonResult.successResult(result);
}
/**
* 模糊匹配公司名稱
* @param companyNameRegex 分詞后的公司名稱
* @param companyName 分詞前的公司名稱
* @return
*/
@Query(value =
"SELECT * FROM company WHERE isDeleted = '0' and companyName REGEXP ?1
ORDER BY length(REPLACE(companyName,?2,''))/length(companyName) ",
nativeQuery = true)
List<Company> queryMatchCompanyName(String companyNameRegex,String companyName);
LENGTH(companyName)返回companyName的長度,LENGTH(REPLACE(companyName, ?2, ''))計算出companyName中關(guān)鍵詞出現(xiàn)的次數(shù)。通過這種方式,我們可以根據(jù)匹配程度進(jìn)行排序,匹配次數(shù)越多的公司名稱排序越靠前。
推薦閱讀:
程序員在線工具站:cxytools.com
推薦一個我自己寫的工具站:http://cxytools.com,專為程序員設(shè)計,包括時間日期、JSON處理、SQL格式化、隨機(jī)字符串生成、UUID生成、隨機(jī)數(shù)生成、文本Hash...等功能,提升開發(fā)效率。
?戳閱讀原文直達(dá)! 朕已閱 
評論
圖片
表情
