1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
class RAGRetriever:
    """RAG retriever that combines vector search with BM25 keyword search.

    Results are dictionaries with the keys ``'memory'``, ``'score'`` and
    ``'source'``; hybrid results additionally carry ``'fused_score'``.
    """

    # Smoothing constant of reciprocal rank fusion: score = weight / (k + rank).
    RRF_K = 60
    # Per-source weights applied to the reciprocal-rank contribution.
    SOURCE_WEIGHTS: Dict[str, float] = {'vector': 0.6, 'keyword': 0.4}
    # Weight used for any source label that is not listed above.
    DEFAULT_SOURCE_WEIGHT = 0.4
    # Placeholder score for vector hits: the search results are consumed here
    # as bare memory objects, so no similarity value is available.
    VECTOR_PLACEHOLDER_SCORE = 0.8

    def __init__(self, vector_store: "VectorStore",
                 memory_manager: "MemoryManager"):
        """Store the collaborators and create the keyword scorer."""
        self.vector_store = vector_store
        self.memory_manager = memory_manager
        self.bm25_scorer = BM25Scorer()

    async def retrieve_relevant(self,
                                query: str,
                                top_k: int = 5,
                                retrieval_mode: str = "hybrid") -> List[Dict]:
        """Retrieve the memories most relevant to ``query``.

        Args:
            query: Free-text query.
            top_k: Maximum number of results; values <= 0 yield ``[]``.
            retrieval_mode: One of
                - ``"vector"``: vector search only
                - ``"keyword"``: keyword (BM25) search only
                - ``"hybrid"``: both, merged with weighted reciprocal rank
                  fusion
                Any other value matches neither search and returns ``[]``.

        Returns:
            At most ``top_k`` result dictionaries, best first.
        """
        if top_k <= 0:
            return []

        is_hybrid = retrieval_mode == "hybrid"
        results: List[Dict] = []

        if retrieval_mode in ("vector", "hybrid"):
            query_embedding = await self.memory_manager._generate_embedding(query)
            # Over-fetch in hybrid mode so the fusion has more candidates.
            vector_results = await self.vector_store.search(
                query_embedding,
                top_k=top_k * 2 if is_hybrid else top_k,
            )
            results.extend(
                {
                    'memory': memory,
                    'score': self.VECTOR_PLACEHOLDER_SCORE,
                    'source': 'vector',
                }
                for memory in vector_results
            )

        if retrieval_mode in ("keyword", "hybrid"):
            all_memories = (self.memory_manager.short_term_memory
                            + self.memory_manager.long_term_memory)
            bm25_results = self.bm25_scorer.search(query, all_memories,
                                                   top_k=top_k)

            # De-duplicate only within the keyword hits. A memory that the
            # vector search also returned must stay in the list, otherwise
            # the fusion can never reward agreement between both sources.
            seen_keyword_ids = set()
            for score, memory in bm25_results:
                if memory.id in seen_keyword_ids:
                    continue
                seen_keyword_ids.add(memory.id)
                results.append({
                    'memory': memory,
                    'score': score,
                    'source': 'keyword',
                })

        if is_hybrid:
            return self._hybrid_fusion(results, top_k)
        return sorted(results, key=lambda x: x['score'], reverse=True)[:top_k]

    def _hybrid_fusion(self, results: List[Dict], top_k: int) -> List[Dict]:
        """Merge per-source rankings with weighted reciprocal rank fusion.

        ``results`` is expected to list each source's hits in rank order
        (best first). Every source contributes
        ``weight / (RRF_K + rank)`` to a memory's fused score, where ``rank``
        is the 1-based position *within that source*. Contributions of a
        memory found by several sources are summed.

        Args:
            results: Result dictionaries with ``'memory'`` and ``'source'``.
            top_k: Maximum number of fused results to return.

        Returns:
            One dictionary per distinct memory (a copy of its first
            occurrence with ``'score'`` and ``'fused_score'`` set to the
            fused value), sorted by fused score descending and truncated to
            ``top_k``. The input dictionaries are not modified.
        """
        if top_k <= 0:
            return []

        fused_scores: Dict[str, float] = {}
        first_occurrence: Dict[str, Dict] = {}
        rank_by_source: Dict[str, int] = {}
        counted = set()

        for result in results:
            memory_id = result['memory'].id
            source = result['source']

            # A source votes at most once per memory.
            if (source, memory_id) in counted:
                continue
            counted.add((source, memory_id))

            rank = rank_by_source.get(source, 0) + 1
            rank_by_source[source] = rank

            weight = self.SOURCE_WEIGHTS.get(source, self.DEFAULT_SOURCE_WEIGHT)
            fused_scores[memory_id] = (fused_scores.get(memory_id, 0.0)
                                       + weight / (self.RRF_K + rank))
            first_occurrence.setdefault(memory_id, result)

        # Dicts keep insertion order and sort() is stable, so ties stay in
        # first-seen order.
        fused_results = [
            {**first_occurrence[memory_id],
             'score': fused,
             'fused_score': fused}
            for memory_id, fused in fused_scores.items()
        ]
        fused_results.sort(key=lambda x: x['fused_score'], reverse=True)
        return fused_results[:top_k]

class BM25Scorer:
    """BM25 keyword scorer.

    The index (document frequencies, IDF table and average document length)
    is either built explicitly with :meth:`fit`, or rebuilt automatically by
    :meth:`search` from the memories it is given.
    """

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """Create an empty scorer.

        Args:
            k1: Term-frequency saturation parameter.
            b: Document-length normalisation parameter.
        """
        self.k1 = k1
        self.b = b
        self.corpus_size = 0
        self.avgdl = 0
        self.doc_freqs: Dict[str, int] = {}
        self.idf: Dict[str, float] = {}
        # True once fit() has produced a non-empty index; search() then
        # reuses that index instead of rebuilding it.
        self._pinned = False

    def fit(self, documents: List[str]):
        """Build the BM25 index from ``documents`` and pin it.

        After a call that yields a non-empty index, :meth:`search` scores
        against this index and no longer rebuilds it. An empty ``documents``
        list produces an empty index instead of raising.
        """
        self._index([self._tokenize(doc) for doc in documents])
        self._pinned = bool(self.idf)

    def _index(self, tokenized_docs: List[List[str]]) -> None:
        """Recompute corpus statistics from already-tokenised documents."""
        corpus_size = len(tokenized_docs)

        doc_freqs: Dict[str, int] = {}
        for tokens in tokenized_docs:
            for token in set(tokens):
                doc_freqs[token] = doc_freqs.get(token, 0) + 1

        self.corpus_size = corpus_size
        self.doc_freqs = doc_freqs
        # log(1 + ...) keeps the IDF strictly positive. The unsmoothed form
        # log((N - n + 0.5) / (n + 0.5)) is negative for terms found in more
        # than half of the documents, which ranks matches below non-matches.
        self.idf = {
            token: math.log(1 + (corpus_size - freq + 0.5) / (freq + 0.5))
            for token, freq in doc_freqs.items()
        }
        total_tokens = sum(len(tokens) for tokens in tokenized_docs)
        self.avgdl = total_tokens / corpus_size if corpus_size else 0

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case ``text`` and split it into word tokens.

        NOTE(review): ``\\w+`` matches a whole unbroken run of word
        characters, so text written without spaces (e.g. Chinese) becomes a
        single token per run. Confirm whether a segmenter is needed.
        """
        import re
        return re.findall(r'\b\w+\b', text.lower())

    def search(self, query: str, memories: List["Memory"],
               top_k: int) -> List[Tuple[float, "Memory"]]:
        """Rank ``memories`` against ``query``.

        Unless an index was pinned with :meth:`fit`, the index is rebuilt
        from ``memories`` on every call so that it never goes stale when
        memories are added or removed.

        Args:
            query: Free-text query.
            memories: Objects exposing a ``content`` string.
            top_k: Maximum number of hits; values <= 0 yield ``[]``.

        Returns:
            ``(score, memory)`` pairs sorted by score descending. Memories
            that match no query term (score 0) are omitted, so fewer than
            ``top_k`` pairs may be returned.
        """
        if top_k <= 0 or not memories:
            return []

        # Tokenise each memory once and use the tokens for both indexing
        # and scoring.
        tokenized = [self._tokenize(memory.content) for memory in memories]
        if not self._pinned:
            self._index(tokenized)

        query_tokens = self._tokenize(query)
        hits = []
        for tokens, memory in zip(tokenized, memories):
            score = self._score_tokens(tokens, query_tokens)
            if score > 0:
                hits.append((score, memory))

        hits.sort(key=lambda pair: pair[0], reverse=True)
        return hits[:top_k]

    def _calc_bm25(self, document: str, query_tokens: List[str]) -> float:
        """Return the BM25 score of a single document string."""
        return self._score_tokens(self._tokenize(document), query_tokens)

    def _score_tokens(self, doc_tokens: List[str],
                      query_tokens: List[str]) -> float:
        """Return the BM25 score of a tokenised document.

        Query tokens missing from the index or from the document contribute
        nothing; a repeated query token contributes once per occurrence.
        """
        if not doc_tokens or not self.avgdl:
            return 0.0

        length_norm = self.k1 * (
            1 - self.b + self.b * len(doc_tokens) / self.avgdl
        )
        score = 0.0
        for token in query_tokens:
            idf = self.idf.get(token)
            if idf is None:
                continue
            tf = doc_tokens.count(token)
            if tf == 0:
                continue
            score += idf * tf * (self.k1 + 1) / (tf + length_norm)
        return score
|