RapidFuzz , 一個神奇的 Python 庫
RapidFuzz 是一個高性能的字符串匹配庫,用于計算字符串之間的相似度。它是 FuzzyWuzzy 的更快替代品,使用 C++ 實現并提供 Python 接口。

官方資料
- 文檔:https://rapidfuzz.github.io/RapidFuzz/
- GitHub:https://github.com/rapidfuzz/RapidFuzz
安裝
pip install rapidfuzz
基本功能
(1) 簡單比率計算
計算兩個字符串的簡單相似度比率:
from rapidfuzz import fuzz
# Score how similar the two whole strings are and show the result.
ratio = fuzz.ratio(
    "hello world",
    "hello python",
)
print(ratio)
# 輸出相似度百分比,例如 53.85
(2) 部分比率
部分字符串相似度(一個字符串是另一個的子集時特別有用):
from rapidfuzz import fuzz
# Score the shorter string against the best-matching part of the longer one.
partial_ratio = fuzz.partial_ratio(
    "hello world",
    "world",
)
print(partial_ratio)
# 輸出 100,因為"world"完全匹配
(3) 令牌排序比率
忽略單詞順序計算相似度:
from rapidfuzz import fuzz
# Compare the strings with word order ignored.
token_sort_ratio = fuzz.token_sort_ratio(
    "hello world",
    "world hello",
)
print(token_sort_ratio)
# 輸出 100,因為單詞相同只是順序不同
(4) 令牌集比率
更靈活的相似度計算,考慮共同和獨特的令牌:
from rapidfuzz import fuzz
# Compare the strings through the tokens they share and the ones they do not.
token_set_ratio = fuzz.token_set_ratio(
    "hello world",
    "hello python world",
)
print(token_set_ratio)
# 輸出較高的相似度,因為有共同單詞
高級功能
(1) 處理多個字符串
找出與查詢最匹配的字符串,獲取單個或多個匹配內容。
from rapidfuzz import process
choices = ["hello world", "hello python", "hello there", "world peace"]
# Pick the single best-scoring choice for the query "hello".
best_match = process.extractOne("hello", choices)
print(best_match)
# Output: ('hello world', 90.0, 0) -- a (choice, score, index) tuple
# Fetch the top 3 matches; the result is a list of such tuples.
top_3 = process.extract("hello", choices, limit=3)
print(top_3)
# 輸出: [('hello world', 90.0, 0), ('hello python', 90.0, 1), ('hello there', 90.0, 2)]
(2) 使用不同評分器
使用不同的相似度計算方法:
# `fuzz` must be imported too: both scorers below live in that module.
from rapidfuzz import fuzz, process

choices = ["hello world", "hello python", "hello there", "world peace"]

# Partial scoring: "world" is contained in both "hello world" and
# "world peace", so both reach a score of 100.
best_partial = process.extractOne("world", choices, scorer=fuzz.partial_ratio)
print(best_partial)
# Output: ('hello world', 100.0, 0) -- a (choice, score, index) tuple.
# NOTE(review): the earlier of the two perfect matches is expected to win;
# confirm the tie-breaking against the installed rapidfuzz version.

# Token-sort scoring ignores word order, so the query has to keep the space
# between its two words in order to line up with "hello python".
best_token = process.extractOne("python hello", choices, scorer=fuzz.token_sort_ratio)
print(best_token)
# 輸出: ('hello python', 100.0, 1)
(3) 自定義權重
可以組合多個評分方法:
def custom_scorer(s1, s2, processor=None, score_cutoff=None, **kwargs):
    """Score two strings as the mean of ``fuzz.ratio`` and ``fuzz.token_sort_ratio``.

    :param s1: first string to compare
    :param s2: second string to compare
    :param processor: optional callable applied to both strings before scoring
    :param score_cutoff: optional minimum score; results below it are reported as 0
    :param kwargs: accepted and ignored, so callers may pass extra scorer options
    :return: the averaged score, or 0 when it falls below ``score_cutoff``
    """
    if processor:
        s1 = processor(s1)
        s2 = processor(s2)
    score = (fuzz.ratio(s1, s2) + fuzz.token_sort_ratio(s1, s2)) / 2
    # Only apply the cutoff when the caller actually supplied one.
    if score_cutoff is not None and score < score_cutoff:
        return 0
    return score
# Candidate strings, then the best one according to the custom scorer.
choices = [
    "hello world",
    "hello python",
    "hello there",
    "world peace",
]
best_custom = process.extractOne(
    "python hello",
    choices,
    scorer=custom_scorer,
)
print(best_custom)
(4) 處理大型數據集
對于大型數據集,可以使用更高效的提取方法(用 score_cutoff 設置最低分數閾值):
large_choices = [...]  # placeholder: substitute the real list of candidate strings
# score_cutoff=80 is the minimum score a candidate needs to be returned.
results = process.extract(
    "query",
    large_choices,
    scorer=fuzz.WRatio,  # keyword was misspelled as `corer`
    score_cutoff=80,
)
應用場景
(1) 數據清洗
import pandas as pd
from rapidfuzz import fuzz, process, utils

df = pd.DataFrame({
    "name": ["John Doe", "Jon D.", "Jane Smith", "jane smith", "Robert Johnson"]
})

# Standardize names.
# Each distinct name is compared only with the canonical names chosen so far.
# Comparing against the full list of unique names would make every name match
# itself with a perfect score, so nothing would ever be merged.
unique_names = df['name'].unique()
canonical_names = []
standardized = {}
for name in unique_names:
    best = None
    if canonical_names:
        best = process.extractOne(
            name,
            canonical_names,
            scorer=fuzz.WRatio,
            # Normalize case/punctuation so "Jane Smith" and "jane smith" agree.
            processor=utils.default_process,
        )
    if best is not None and best[1] > 85:  # similarity threshold
        standardized[name] = best[0]
    else:
        # Nothing close enough yet: this spelling becomes a canonical name.
        canonical_names.append(name)
        standardized[name] = name
df['standardized_name'] = df['name'].map(standardized)
print(df)
(2) 搜索查找
from rapidfuzz import process
products = ["iPhone 12", "iPhone 12 Pro", "Samsung Galaxy S21", "iPad Pro"]
def search_suggestions(query, items, limit=3):
    """Return the matched entries of ``items`` for ``query``, without scores.

    :param query: text to search for
    :param items: candidate strings handed to ``process.extract``
    :param limit: forwarded to ``process.extract`` as its ``limit`` argument
    :return: list holding the first element of every result tuple
    """
    ranked = process.extract(query, items, limit=limit)
    suggestions = []
    for hit in ranked:
        suggestions.append(hit[0])
    return suggestions
print(search_suggestions("iphone", products)) # 輸出: ['iPhone 12', 'iPhone 12 Pro']
實戰
公司房源數據綁定通通鎖,有一個房源表,表中含有 lock_id 字段為通通鎖ID。維修人員安裝鎖并錄入鎖數據時,錄入的鎖名稱和房源名稱往往不完全一致。這使得技術人員通過 SQL 語句的 = 或 LIKE 都不能完全實現綁定。
(1) 思路
- 房源數據在數據庫表中,包含 lock_id 字段
- 智能鎖數據是外部的 JSON 集合
- 需要將 JSON 中的智能鎖通過名稱模糊匹配綁定到房源
鎖數據如下:
smart_locks_json = [
{"id":"lock001", "name": "301室", "mac": "00:1A:2B:3C:4D:5E"},
{"id":"lock002", "name": "302-A", "mac": "00:1A:2B:3C:4D:5F"},
{"id":"lock003", "name": "三樓303", "mac": "00:1A:2B:3C:4D:60"},
]
數據表ORM:
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
Base = declarative_base()


class House(Base):
    """ORM model for one row of the ``houses`` table."""

    __tablename__ = 'houses'

    id = Column(Integer, primary_key=True)
    name = Column(String)  # listing name, e.g. "3樓301號"
    address = Column(String)
    # ID of the bound smart lock, stored as a string.
    # NOTE(review): keep this type in line with the lock IDs in the JSON data.
    lock_id = Column(String)


# SQLite database file next to the script; create the table if it is missing.
engine = create_engine('sqlite:///house.db')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
db = Session()
實現代碼如下:
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
Base = declarative_base()


class House(Base):
    """ORM model for one row of the ``houses`` table."""

    __tablename__ = 'houses'

    id = Column(Integer, primary_key=True)
    name = Column(String)  # listing name, e.g. "3樓301號"
    address = Column(String)
    # ID of the bound smart lock, stored as a string.
    # NOTE(review): keep this type in line with the lock IDs in the JSON data.
    lock_id = Column(String)


# SQLite database file next to the script; create the table if it is missing.
engine = create_engine('sqlite:///house.db')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Module-level session used for queries and commits.
db = Session()
實現代碼如下:
from rapidfuzz import process, fuzz, utils
from typing import List, Dict, Tuple
def match_locks_to_properties(
    locks_data: List[Dict],
    threshold: int = 80,
    limit_per_lock: int = 3
) -> List[Tuple[Dict, House, float]]:
    """
    Fuzzy-match smart locks from JSON data to listings that have no lock yet.

    :param locks_data: smart-lock dicts; each one is matched by its "name" value
    :param threshold: minimum match score, passed to RapidFuzz as ``score_cutoff``
    :param limit_per_lock: maximum number of candidate listings returned per lock
    :return: list of (lock dict, House object, match score) tuples
    """
    # Only listings whose lock_id is still NULL are candidates.
    unassigned_properties = db.query(House).filter(House.lock_id.is_(None)).all()
    if not locks_data or not unassigned_properties:
        return []

    # Names in the same order as unassigned_properties, so a result's index
    # identifies the listing it came from.
    property_names = [prop.name for prop in unassigned_properties]

    matches = []
    for lock in locks_data:
        lock_name = lock.get("name", "")
        if not lock_name:
            # Nothing to match on; skip locks without a name.
            continue
        results = process.extract(
            lock_name,
            property_names,
            scorer=fuzz.WRatio,  # weighted ratio
            limit=limit_per_lock,
            score_cutoff=threshold,
        )
        # Resolve by index rather than by name: a name-keyed dict would keep
        # only one listing when two listings share the same name.
        for _, score, index in results:
            matches.append((lock, unassigned_properties[index], score))
    return matches
def apply_matches(matches: List[Tuple[Dict, House, float]]):
    """
    Write match results to the database, one lock per listing.

    Matches are applied best score first. A match is skipped when its lock or
    its listing was already bound by a higher-scoring match, so one lock is
    never written to several listings and a listing is never silently rebound.
    All assignments are committed together at the end.

    :param matches: (lock dict, House object, match score) tuples; each lock
        dict must provide "id" and "name"
    """
    bound_lock_ids = set()
    bound_props = set()
    # sorted() is stable, so matches with equal scores keep their input order.
    for lock, prop, score in sorted(matches, key=lambda m: m[2], reverse=True):
        if lock['id'] in bound_lock_ids or id(prop) in bound_props:
            continue
        print(f"匹配: 鎖 '{lock['name']}' (ID: {lock['id']}) -> 房源 '{prop.name}' (分數: {score:.2f})")
        prop.lock_id = lock['id']
        bound_lock_ids.add(lock['id'])
        bound_props.add(id(prop))
    db.commit()


































