feat: init rag service

This commit is contained in:
djeon
2025-10-29 05:54:08 +09:00
parent 44ae9c546f
commit 5d897cb845
54 changed files with 6425 additions and 0 deletions
View File
Binary file not shown.
Binary file not shown.
+359
View File
@@ -0,0 +1,359 @@
"""
Azure AI Search 관련자료 DB
"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
SearchIndex,
SimpleField,
SearchableField,
SearchField,
VectorSearch,
HnswAlgorithmConfiguration,
VectorSearchProfile,
SemanticConfiguration,
SemanticField,
SemanticPrioritizedFields,
SemanticSearch,
SearchFieldDataType
)
from azure.search.documents.models import (
VectorizedQuery,
QueryType,
QueryCaptionType,
QueryAnswerType
)
from typing import List, Dict, Any, Optional
import logging
from ..models.document import DocumentChunk
logger = logging.getLogger(__name__)
class AzureAISearchDB:
    """Azure AI Search store for meeting-related reference documents.

    Holds a ``SearchClient`` for document operations on ``index_name`` and a
    ``SearchIndexClient`` for managing the index definition.
    """

    # Upper bound on the number of documents sent per upload/delete request.
    _BATCH_SIZE = 1000
    # Minimum number of candidates requested from the service per search.
    _MIN_CANDIDATES = 50

    def __init__(
        self,
        endpoint: str,
        api_key: str,
        index_name: str = "meeting-minutes-index",
        api_version: str = "2023-11-01"
    ):
        """Create the search and index clients.

        Args:
            endpoint: Azure AI Search endpoint URL.
            api_key: API key used to build the ``AzureKeyCredential``.
            index_name: Name of the index the search client targets.
            api_version: Accepted for interface compatibility; it is not
                forwarded to either client.
        """
        self.endpoint = endpoint
        self.api_key = api_key
        self.index_name = index_name
        credential = AzureKeyCredential(api_key)
        self.search_client = SearchClient(
            endpoint=endpoint,
            index_name=index_name,
            credential=credential
        )
        self.index_client = SearchIndexClient(
            endpoint=endpoint,
            credential=credential
        )

    @staticmethod
    def _escape_odata(value: str) -> str:
        """Escape *value* for use inside a single-quoted OData string literal.

        OData represents a literal single quote as two single quotes, so an
        unescaped quote in a filter value would terminate the literal early.
        """
        return str(value).replace("'", "''")

    def create_index(self):
        """Create or update the index schema for ``self.index_name``.

        Defines the document fields, the HNSW vector search profile and the
        semantic configuration, then calls ``create_or_update_index``.

        Raises:
            Exception: Whatever the index client raises; it is logged and
                re-raised unchanged.
        """
        # Field definitions
        fields = [
            SimpleField(name="id", type=SearchFieldDataType.String, key=True),
            SimpleField(name="documentId", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="documentType", type=SearchFieldDataType.String, filterable=True, facetable=True),
            SearchableField(name="title", type=SearchFieldDataType.String, analyzer_name="ko.lucene"),
            SimpleField(name="folder", type=SearchFieldDataType.String, filterable=True, facetable=True),
            SimpleField(name="createdDate", type=SearchFieldDataType.DateTimeOffset, filterable=True, sortable=True),
            SearchField(
                name="participants",
                type=SearchFieldDataType.Collection(SearchFieldDataType.String),
                searchable=True,
                filterable=True,
                facetable=True
            ),
            SearchField(
                name="keywords",
                type=SearchFieldDataType.Collection(SearchFieldDataType.String),
                searchable=True,
                facetable=True
            ),
            SimpleField(name="agendaId", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="agendaTitle", type=SearchFieldDataType.String, analyzer_name="ko.lucene"),
            SimpleField(name="chunkIndex", type=SearchFieldDataType.Int32, filterable=True, sortable=True),
            SearchableField(name="content", type=SearchFieldDataType.String, analyzer_name="ko.lucene"),
            SearchField(
                name="contentVector",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=1536,
                vector_search_profile_name="meeting-vector-profile"
            ),
            SimpleField(name="tokenCount", type=SearchFieldDataType.Int32, filterable=True)
        ]

        # Vector search configuration
        # NOTE(review): the HNSW parameters are passed as a plain dict with
        # REST-style keys -- confirm the installed SDK serializes this form.
        vector_search = VectorSearch(
            profiles=[
                VectorSearchProfile(
                    name="meeting-vector-profile",
                    algorithm_configuration_name="meeting-hnsw"
                )
            ],
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="meeting-hnsw",
                    parameters={
                        "m": 4,
                        "efConstruction": 400,
                        "efSearch": 500,
                        "metric": "cosine"
                    }
                )
            ]
        )

        # Semantic search configuration
        semantic_config = SemanticConfiguration(
            name="meeting-semantic-config",
            prioritized_fields=SemanticPrioritizedFields(
                title_field=SemanticField(field_name="title"),
                content_fields=[SemanticField(field_name="content")],
                keywords_fields=[SemanticField(field_name="keywords")]
            )
        )
        semantic_search = SemanticSearch(
            configurations=[semantic_config]
        )

        # Index definition
        index = SearchIndex(
            name=self.index_name,
            fields=fields,
            vector_search=vector_search,
            semantic_search=semantic_search
        )
        try:
            self.index_client.create_or_update_index(index)
            logger.info(f"Azure AI Search 인덱스 생성 완료: {self.index_name}")
        except Exception as e:
            logger.error(f"인덱스 생성 실패: {str(e)}")
            raise

    def upload_documents(self, chunks: List[DocumentChunk]) -> bool:
        """Upload document chunks in batches.

        Args:
            chunks: Chunks to upload; each is converted with ``chunk.dict()``.

        Returns:
            True when every document in every batch was accepted (or when
            ``chunks`` is empty); False when any document was rejected or an
            exception was raised.
        """
        if not chunks:
            return True
        try:
            # Convert the Pydantic models into plain dictionaries.
            documents = [chunk.dict() for chunk in chunks]
            all_succeeded = True
            for start in range(0, len(documents), self._BATCH_SIZE):
                batch = documents[start:start + self._BATCH_SIZE]
                batch_no = start // self._BATCH_SIZE + 1
                results = self.search_client.upload_documents(documents=batch)
                # The call can return normally while individual documents
                # failed, so inspect the per-document results.
                failed_keys = [r.key for r in results if not r.succeeded]
                if failed_keys:
                    all_succeeded = False
                    logger.error(
                        f"배치 {batch_no}: {len(failed_keys)}개 문서 업로드 실패 (keys: {failed_keys})"
                    )
                else:
                    logger.info(f"배치 {batch_no}: {len(batch)}개 문서 업로드 완료")
            return all_succeeded
        except Exception as e:
            logger.error(f"문서 업로드 실패: {str(e)}")
            return False

    def hybrid_search(
        self,
        query: str,
        query_embedding: List[float],
        top_k: int = 3,
        folder: Optional[str] = None,
        document_type: Optional[str] = None,
        semantic_ranking: bool = True
    ) -> List[Dict[str, Any]]:
        """Run a hybrid (keyword + vector, optionally semantic) search.

        Args:
            query: Search text.
            query_embedding: Embedding vector of the query.
            top_k: Maximum number of results to return.
            folder: Optional exact-match filter on the ``folder`` field.
            document_type: Optional exact-match filter on ``documentType``.
            semantic_ranking: Whether to enable semantic re-ranking.

        Returns:
            Up to ``top_k`` result dictionaries; an empty list when
            ``top_k`` is not positive or when the search fails.
        """
        if top_k <= 0:
            return []
        try:
            # Never request fewer candidates than the caller wants back.
            candidates = max(top_k, self._MIN_CANDIDATES)

            # Vector query
            vector_query = VectorizedQuery(
                vector=query_embedding,
                k_nearest_neighbors=candidates,
                fields="contentVector"
            )

            # Build the filter; values are escaped so that a quote in the
            # input cannot break out of the OData string literal.
            filter_parts = []
            if folder:
                filter_parts.append(f"folder eq '{self._escape_odata(folder)}'")
            if document_type:
                filter_parts.append(f"documentType eq '{self._escape_odata(document_type)}'")
            filter_expression = " and ".join(filter_parts) if filter_parts else None

            # Search options. "documentType" must be selected because it is
            # read from every result below.
            search_params = {
                "search_text": query,
                "vector_queries": [vector_query],
                "select": [
                    "documentId", "documentType", "title", "createdDate",
                    "content", "agendaTitle", "folder"
                ],
                "top": candidates if semantic_ranking else top_k,
                "filter": filter_expression
            }

            # Enable semantic ranking
            if semantic_ranking:
                search_params.update({
                    "query_type": QueryType.SEMANTIC,
                    "semantic_configuration_name": "meeting-semantic-config",
                    "query_caption": QueryCaptionType.EXTRACTIVE,
                    "query_answer": QueryAnswerType.EXTRACTIVE
                })

            # Execute the search
            results = self.search_client.search(**search_params)

            # zip() with the range first stops after top_k items without
            # pulling an extra result from the iterator.
            search_results = []
            for _, result in zip(range(top_k), results):
                # Prefer the re-ranker score. The key may be present with a
                # None value, so a .get() default is not sufficient.
                score = result.get("@search.reranker_score")
                if score is None:
                    score = result.get("@search.score") or 0.0

                # Relevance level. NOTE(review): the thresholds look
                # calibrated for the re-ranker score; the plain search score
                # is on a different scale -- confirm the intended mapping.
                if score >= 3.0:
                    relevance_level = "HIGH"
                elif score >= 2.0:
                    relevance_level = "MEDIUM"
                else:
                    relevance_level = "LOW"

                # Use the first semantic caption when available, otherwise
                # the first 300 characters of the content (which may be None).
                captions = result.get("@search.captions")
                if captions:
                    excerpt = captions[0].text
                else:
                    excerpt = (result.get("content") or "")[:300]

                search_results.append({
                    "document_id": result["documentId"],
                    "title": result["title"],
                    "document_type": result.get("documentType", "unknown"),
                    "created_date": result.get("createdDate"),
                    "relevance_score": min(score / 4.0, 1.0),  # clamp to at most 1.0
                    "relevance_level": relevance_level,
                    "content_excerpt": excerpt,
                    "folder": result.get("folder")
                })
            return search_results
        except Exception as e:
            logger.error(f"Hybrid Search 실패: {str(e)}")
            return []

    def delete_documents_by_id(self, document_id: str) -> bool:
        """Delete every chunk whose ``documentId`` equals *document_id*.

        Args:
            document_id: Identifier of the source document.

        Returns:
            True when the lookup and all delete requests completed (including
            when no chunk matched); False when an exception was raised.
        """
        try:
            # Look up the keys of all chunks that belong to the document.
            results = self.search_client.search(
                search_text="*",
                filter=f"documentId eq '{self._escape_odata(document_id)}'",
                select=["id"]
            )
            chunk_ids = [{"id": result["id"]} for result in results]

            # Delete in bounded batches, mirroring upload_documents.
            for start in range(0, len(chunk_ids), self._BATCH_SIZE):
                self.search_client.delete_documents(
                    documents=chunk_ids[start:start + self._BATCH_SIZE]
                )
            if chunk_ids:
                logger.info(f"문서 {document_id}: {len(chunk_ids)}개 청크 삭제 완료")
            return True
        except Exception as e:
            logger.error(f"문서 삭제 실패 ({document_id}): {str(e)}")
            return False

    def get_stats(self) -> Dict[str, Any]:
        """Collect simple index statistics by scanning search results.

        The scan requests at most 10000 chunks (``top=10000``), so the
        numbers are a lower bound for larger indexes.

        Returns:
            A dict with ``total_documents`` (distinct ``documentId`` values),
            ``total_chunks`` and ``by_type`` (chunk count per
            ``documentType``). All zero/empty when the query fails.
        """
        try:
            results = self.search_client.search(
                search_text="*",
                select=["documentId", "documentType"],
                top=10000
            )
            document_ids = set()
            type_counts = {}
            for result in results:
                doc_id = result.get("documentId")
                doc_type = result.get("documentType", "unknown")
                # Chunks without a documentId are not counted at all.
                if doc_id:
                    document_ids.add(doc_id)
                    type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
            return {
                "total_documents": len(document_ids),
                "total_chunks": sum(type_counts.values()),
                "by_type": type_counts
            }
        except Exception as e:
            logger.error(f"통계 조회 실패: {str(e)}")
            return {
                "total_documents": 0,
                "total_chunks": 0,
                "by_type": {}
            }
+381
View File
@@ -0,0 +1,381 @@
"""
PostgreSQL + pgvector 용어집 DB
"""
import psycopg2
from psycopg2.extras import RealDictCursor
from typing import List, Optional, Dict, Any
from contextlib import contextmanager
import logging
import json
from ..models.term import Term
from ..utils.embedding import cosine_similarity
logger = logging.getLogger(__name__)
class PostgresVectorDB:
    """PostgreSQL + pgvector backed glossary (term) database."""

    def __init__(self, connection_string: str):
        """Store the connection string; no connection is opened here.

        Args:
            connection_string: PostgreSQL connection string.
        """
        self.connection_string = connection_string

    @contextmanager
    def get_connection(self):
        """Yield a new psycopg2 connection and always close it afterwards.

        Committing is the caller's responsibility.
        """
        conn = psycopg2.connect(self.connection_string)
        try:
            yield conn
        finally:
            conn.close()

    @staticmethod
    def _escape_like(text: str) -> str:
        """Escape LIKE/ILIKE metacharacters so that *text* matches literally.

        Backslash is escaped first because it is the escape character itself.
        """
        return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    @staticmethod
    def _parse_embedding(embedding_str: Optional[str]) -> Optional[List[float]]:
        """Convert a vector value read from the database into a Python list.

        Args:
            embedding_str: Vector as a string such as ``"[-0.003,0.01,...]"``,
                or an already decoded list.

        Returns:
            The decoded list; None for an empty value, an unsupported type or
            a string that is not valid JSON.
        """
        if not embedding_str:
            return None
        try:
            # The vector column arrives as a "[1,2,3]" style string.
            if isinstance(embedding_str, str):
                return json.loads(embedding_str)
            elif isinstance(embedding_str, list):
                return embedding_str
            return None
        except (json.JSONDecodeError, ValueError) as e:
            logger.error(f"임베딩 파싱 실패: {str(e)}")
            return None

    @staticmethod
    def _row_to_term(row: Dict[str, Any]) -> Term:
        """Build a ``Term`` from a database row.

        The ``embedding`` column is dropped and not passed to ``Term``. It is
        removed with a default so that rows selected without that column do
        not raise ``KeyError``.

        Args:
            row: Database row as a mapping.

        Returns:
            The ``Term`` built from the remaining columns.
        """
        term_dict = dict(row)
        term_dict.pop("embedding", None)
        return Term(**term_dict)

    def init_database(self):
        """Create the pgvector extension, tables and indexes if missing."""
        with self.get_connection() as conn:
            with conn.cursor() as cur:
                # Install the pgvector extension.
                cur.execute("CREATE EXTENSION IF NOT EXISTS vector")

                # terms table
                cur.execute("""
                    CREATE TABLE IF NOT EXISTS terms (
                        term_id VARCHAR(100) PRIMARY KEY,
                        term_name VARCHAR(200) NOT NULL,
                        normalized_name VARCHAR(200) NOT NULL,
                        category VARCHAR(100),
                        definition TEXT NOT NULL,
                        context TEXT,
                        synonyms JSONB DEFAULT '[]',
                        related_terms JSONB DEFAULT '[]',
                        document_source JSONB,
                        confidence_score DECIMAL(3,2) DEFAULT 0.0,
                        usage_count INTEGER DEFAULT 0,
                        last_updated VARCHAR(50),
                        embedding vector(1536),
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                """)

                # Lookup indexes
                cur.execute("""
                    CREATE INDEX IF NOT EXISTS idx_terms_normalized_name
                    ON terms(normalized_name)
                """)
                cur.execute("""
                    CREATE INDEX IF NOT EXISTS idx_terms_category
                    ON terms(category)
                """)
                cur.execute("""
                    CREATE INDEX IF NOT EXISTS idx_terms_confidence
                    ON terms(confidence_score DESC)
                """)

                # Vector similarity index (IVFFlat, cosine distance)
                cur.execute("""
                    CREATE INDEX IF NOT EXISTS idx_terms_embedding
                    ON terms USING ivfflat (embedding vector_cosine_ops)
                    WITH (lists = 100)
                """)

                # term_usage_logs table (usage history)
                cur.execute("""
                    CREATE TABLE IF NOT EXISTS term_usage_logs (
                        log_id SERIAL PRIMARY KEY,
                        term_id VARCHAR(100) REFERENCES terms(term_id) ON DELETE CASCADE,
                        user_id VARCHAR(100),
                        meeting_id VARCHAR(100),
                        action VARCHAR(20),
                        feedback_rating INTEGER,
                        feedback_comment TEXT,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                """)
                cur.execute("""
                    CREATE INDEX IF NOT EXISTS idx_usage_term_id
                    ON term_usage_logs(term_id, created_at DESC)
                """)
            conn.commit()
        logger.info("PostgreSQL 데이터베이스 초기화 완료")

    def insert_term(self, term: Term) -> bool:
        """Insert a term, or update it when ``term_id`` already exists.

        Args:
            term: Term to store.

        Returns:
            True on success, False when any exception was raised (the error
            is logged).
        """
        try:
            # Store SQL NULL (not the JSON value null) when there is no source.
            document_source = (
                psycopg2.extras.Json(term.document_source.dict())
                if term.document_source else None
            )
            with self.get_connection() as conn:
                with conn.cursor() as cur:
                    cur.execute("""
                        INSERT INTO terms (
                            term_id, term_name, normalized_name, category,
                            definition, context, synonyms, related_terms,
                            document_source, confidence_score, usage_count,
                            last_updated, embedding
                        ) VALUES (
                            %s, %s, %s, %s, %s, %s, %s::jsonb, %s::jsonb,
                            %s::jsonb, %s, %s, %s, %s::vector
                        )
                        ON CONFLICT (term_id) DO UPDATE SET
                            term_name = EXCLUDED.term_name,
                            normalized_name = EXCLUDED.normalized_name,
                            category = EXCLUDED.category,
                            definition = EXCLUDED.definition,
                            context = EXCLUDED.context,
                            synonyms = EXCLUDED.synonyms,
                            related_terms = EXCLUDED.related_terms,
                            document_source = EXCLUDED.document_source,
                            confidence_score = EXCLUDED.confidence_score,
                            usage_count = EXCLUDED.usage_count,
                            last_updated = EXCLUDED.last_updated,
                            embedding = EXCLUDED.embedding,
                            updated_at = CURRENT_TIMESTAMP
                    """, (
                        term.term_id,
                        term.term_name,
                        term.normalized_name,
                        term.category,
                        term.definition,
                        term.context,
                        psycopg2.extras.Json(term.synonyms),
                        psycopg2.extras.Json(term.related_terms),
                        document_source,
                        term.confidence_score,
                        term.usage_count,
                        term.last_updated,
                        term.embedding
                    ))
                conn.commit()
            return True
        except Exception as e:
            logger.error(f"용어 삽입 실패 ({term.term_id}): {str(e)}")
            return False

    def get_term_by_id(self, term_id: str) -> Optional[Term]:
        """Fetch a single term by its ID.

        Args:
            term_id: Term identifier.

        Returns:
            The matching ``Term`` or None when no row exists.
        """
        with self.get_connection() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("""
                    SELECT * FROM terms WHERE term_id = %s
                """, (term_id,))
                row = cur.fetchone()
                if row:
                    return self._row_to_term(row)
                return None

    def search_by_keyword(
        self,
        query: str,
        top_k: int = 5,
        confidence_threshold: float = 0.7
    ) -> List[Dict[str, Any]]:
        """Search terms by substring match on name, synonyms and definition.

        Args:
            query: Search text; ``%``, ``_`` and ``\\`` are matched literally.
            top_k: Maximum number of results.
            confidence_threshold: Minimum ``confidence_score`` of a term.

        Returns:
            A list of ``{"term", "relevance_score", "match_type"}`` dicts
            ordered by match score, confidence and usage count. A blank query
            yields an empty list without touching the database.
        """
        normalized_query = query.lower().strip()
        if not normalized_query:
            # A '%%' pattern would otherwise match every row.
            return []

        # Escape wildcards so user input cannot widen the match.
        normalized_pattern = f"%{self._escape_like(normalized_query)}%"
        raw_pattern = f"%{self._escape_like(query)}%"

        with self.get_connection() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("""
                    SELECT *,
                        CASE
                            WHEN normalized_name = %s THEN 1.0
                            WHEN normalized_name LIKE %s THEN 0.9
                            WHEN term_name ILIKE %s THEN 0.8
                            WHEN synonyms::text ILIKE %s THEN 0.7
                            ELSE 0.5
                        END as match_score
                    FROM terms
                    WHERE (
                        normalized_name LIKE %s
                        OR term_name ILIKE %s
                        OR synonyms::text ILIKE %s
                        OR definition ILIKE %s
                    )
                    AND confidence_score >= %s
                    ORDER BY match_score DESC, confidence_score DESC, usage_count DESC
                    LIMIT %s
                """, (
                    normalized_query,
                    normalized_pattern,
                    raw_pattern,
                    raw_pattern,
                    normalized_pattern,
                    raw_pattern,
                    raw_pattern,
                    raw_pattern,
                    confidence_threshold,
                    top_k
                ))
                results = []
                for row in cur.fetchall():
                    term_dict = dict(row)
                    # match_score is a computed column, not a Term field.
                    match_score = term_dict.pop("match_score")
                    results.append({
                        "term": self._row_to_term(term_dict),
                        "relevance_score": float(match_score),
                        "match_type": "keyword"
                    })
                return results

    def search_by_vector(
        self,
        query_embedding: List[float],
        top_k: int = 5,
        confidence_threshold: float = 0.7
    ) -> List[Dict[str, Any]]:
        """Search terms by cosine distance to *query_embedding*.

        Args:
            query_embedding: Embedding vector of the query.
            top_k: Maximum number of results.
            confidence_threshold: Minimum ``confidence_score`` of a term.

        Returns:
            A list of ``{"term", "relevance_score", "match_type"}`` dicts,
            nearest first; ``relevance_score`` is ``1 - distance``.
        """
        with self.get_connection() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("""
                    SELECT *,
                        1 - (embedding <=> %s::vector) as similarity_score
                    FROM terms
                    WHERE confidence_score >= %s
                        AND embedding IS NOT NULL
                    ORDER BY embedding <=> %s::vector
                    LIMIT %s
                """, (
                    query_embedding,
                    confidence_threshold,
                    query_embedding,
                    top_k
                ))
                results = []
                for row in cur.fetchall():
                    term_dict = dict(row)
                    # similarity_score is a computed column, not a Term field.
                    similarity_score = term_dict.pop("similarity_score")
                    results.append({
                        "term": self._row_to_term(term_dict),
                        "relevance_score": float(similarity_score),
                        "match_type": "vector"
                    })
                return results

    def get_stats(self) -> Dict[str, Any]:
        """Return glossary statistics.

        Returns:
            A dict with ``total_terms``, ``avg_confidence`` (0.0 when there is
            no average) and ``by_category`` (term count per category).
        """
        with self.get_connection() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # Overall totals
                cur.execute("""
                    SELECT
                        COUNT(*) as total_terms,
                        AVG(confidence_score) as avg_confidence
                    FROM terms
                """)
                overall = cur.fetchone()

                # Per-category counts
                cur.execute("""
                    SELECT category, COUNT(*) as count
                    FROM terms
                    GROUP BY category
                    ORDER BY count DESC
                """)
                by_category = {row["category"]: row["count"] for row in cur.fetchall()}
                return {
                    "total_terms": overall["total_terms"],
                    "avg_confidence": float(overall["avg_confidence"]) if overall["avg_confidence"] else 0.0,
                    "by_category": by_category
                }
+338
View File
@@ -0,0 +1,338 @@
"""
RAG 회의록 데이터베이스
"""
import psycopg2
from psycopg2.extras import RealDictCursor
from typing import List, Optional, Dict, Any
from contextlib import contextmanager
import logging
import json
from datetime import datetime
from ..models.minutes import RagMinutes, MinutesSection
from ..utils.embedding import cosine_similarity
logger = logging.getLogger(__name__)
class RagMinutesDB:
    """PostgreSQL + pgvector backed database for RAG meeting minutes."""

    def __init__(self, connection_string: str):
        """Store the connection string; no connection is opened here.

        Args:
            connection_string: PostgreSQL connection string.
        """
        self.connection_string = connection_string

    @contextmanager
    def get_connection(self):
        """Yield a new psycopg2 connection and always close it afterwards.

        Committing is the caller's responsibility.
        """
        conn = psycopg2.connect(self.connection_string)
        try:
            yield conn
        finally:
            conn.close()

    @staticmethod
    def _escape_like(text: str) -> str:
        """Escape LIKE/ILIKE metacharacters so that *text* matches literally.

        Backslash is escaped first because it is the escape character itself.
        """
        return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    @staticmethod
    def _parse_embedding(embedding_str: Optional[str]) -> Optional[List[float]]:
        """Convert a vector value read from the database into a Python list.

        Args:
            embedding_str: Vector as a string such as ``"[-0.003,0.01,...]"``,
                or an already decoded list.

        Returns:
            The decoded list; None for an empty value, an unsupported type or
            a string that is not valid JSON.
        """
        if not embedding_str:
            return None
        try:
            # The vector column arrives as a "[1,2,3]" style string.
            if isinstance(embedding_str, str):
                return json.loads(embedding_str)
            elif isinstance(embedding_str, list):
                return embedding_str
            return None
        except (json.JSONDecodeError, ValueError) as e:
            logger.error(f"임베딩 파싱 실패: {str(e)}")
            return None

    @staticmethod
    def _row_to_minutes(row: Dict[str, Any]) -> RagMinutes:
        """Build a ``RagMinutes`` from a database row.

        Decodes ``embedding``, turns ``sections`` into ``MinutesSection``
        objects (an empty list when missing or empty) and converts datetime
        columns into ISO-8601 strings.

        Args:
            row: Database row as a mapping.

        Returns:
            The ``RagMinutes`` built from the converted columns.
        """
        minutes_dict = dict(row)

        # Decode the embedding column.
        if "embedding" in minutes_dict:
            minutes_dict["embedding"] = RagMinutesDB._parse_embedding(minutes_dict["embedding"])

        # Decode the sections column; it may arrive as a JSON string.
        if "sections" in minutes_dict and minutes_dict["sections"]:
            sections_data = minutes_dict["sections"]
            if isinstance(sections_data, str):
                sections_data = json.loads(sections_data)
            minutes_dict["sections"] = [MinutesSection(**section) for section in sections_data]
        else:
            minutes_dict["sections"] = []

        # Convert datetime columns into strings; other types pass through.
        for field in ['scheduled_at', 'finalized_at', 'created_at', 'updated_at']:
            if field in minutes_dict and minutes_dict[field]:
                if isinstance(minutes_dict[field], datetime):
                    minutes_dict[field] = minutes_dict[field].isoformat()
        return RagMinutes(**minutes_dict)

    def insert_minutes(self, minutes: RagMinutes) -> bool:
        """Insert minutes, or update them when ``minutes_id`` already exists.

        On conflict, ``minutes_id`` and ``created_by`` keep their stored
        values; every other inserted column is overwritten.

        Args:
            minutes: Minutes to store.

        Returns:
            True on success, False when any exception was raised (the error
            is logged).
        """
        try:
            with self.get_connection() as conn:
                with conn.cursor() as cur:
                    # Serialize the sections; tolerate a missing list.
                    sections_json = [section.dict() for section in (minutes.sections or [])]
                    cur.execute("""
                        INSERT INTO rag_minutes (
                            meeting_id, title, purpose, description, scheduled_at,
                            location, organizer_id, minutes_id, minutes_status,
                            minutes_version, created_by, finalized_by, finalized_at,
                            sections, full_content, embedding
                        ) VALUES (
                            %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                            %s::jsonb, %s, %s::vector
                        )
                        ON CONFLICT (minutes_id) DO UPDATE SET
                            meeting_id = EXCLUDED.meeting_id,
                            title = EXCLUDED.title,
                            purpose = EXCLUDED.purpose,
                            description = EXCLUDED.description,
                            scheduled_at = EXCLUDED.scheduled_at,
                            location = EXCLUDED.location,
                            organizer_id = EXCLUDED.organizer_id,
                            minutes_status = EXCLUDED.minutes_status,
                            minutes_version = EXCLUDED.minutes_version,
                            finalized_by = EXCLUDED.finalized_by,
                            finalized_at = EXCLUDED.finalized_at,
                            sections = EXCLUDED.sections,
                            full_content = EXCLUDED.full_content,
                            embedding = EXCLUDED.embedding,
                            updated_at = CURRENT_TIMESTAMP
                    """, (
                        minutes.meeting_id,
                        minutes.title,
                        minutes.purpose,
                        minutes.description,
                        minutes.scheduled_at,
                        minutes.location,
                        minutes.organizer_id,
                        minutes.minutes_id,
                        minutes.minutes_status,
                        minutes.minutes_version,
                        minutes.created_by,
                        minutes.finalized_by,
                        minutes.finalized_at,
                        psycopg2.extras.Json(sections_json),
                        minutes.full_content,
                        minutes.embedding
                    ))
                conn.commit()
            logger.info(f"회의록 저장 성공: {minutes.minutes_id}")
            return True
        except Exception as e:
            logger.error(f"회의록 저장 실패 ({minutes.minutes_id}): {str(e)}")
            return False

    def get_minutes_by_id(self, minutes_id: str) -> Optional[RagMinutes]:
        """Fetch minutes by ID.

        Args:
            minutes_id: Minutes identifier.

        Returns:
            The matching ``RagMinutes`` or None when no row exists.
        """
        with self.get_connection() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("""
                    SELECT * FROM rag_minutes WHERE minutes_id = %s
                """, (minutes_id,))
                row = cur.fetchone()
                if row:
                    return self._row_to_minutes(row)
                return None

    def search_by_vector(
        self,
        query_embedding: List[float],
        top_k: int = 5,
        similarity_threshold: float = 0.7
    ) -> List[Dict[str, Any]]:
        """Search minutes by cosine distance to *query_embedding*.

        Args:
            query_embedding: Embedding vector of the query.
            top_k: Maximum number of results.
            similarity_threshold: Minimum ``1 - distance`` a row must reach.

        Returns:
            A list of ``{"minutes", "similarity_score"}`` dicts, nearest
            first.
        """
        with self.get_connection() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("""
                    SELECT *,
                        1 - (embedding <=> %s::vector) as similarity_score
                    FROM rag_minutes
                    WHERE embedding IS NOT NULL
                        AND 1 - (embedding <=> %s::vector) >= %s
                    ORDER BY embedding <=> %s::vector
                    LIMIT %s
                """, (
                    query_embedding,
                    query_embedding,
                    similarity_threshold,
                    query_embedding,
                    top_k
                ))
                results = []
                for row in cur.fetchall():
                    minutes_dict = dict(row)
                    # similarity_score is a computed column, not a model field.
                    similarity_score = minutes_dict.pop("similarity_score")
                    results.append({
                        "minutes": self._row_to_minutes(minutes_dict),
                        "similarity_score": float(similarity_score)
                    })
                logger.info(f"벡터 검색 완료: {len(results)}개 결과")
                return results

    def search_by_keyword(
        self,
        query: str,
        top_k: int = 5
    ) -> List[Dict[str, Any]]:
        """Search minutes by full-text match on content or substring of title.

        Args:
            query: Search text; ``%``, ``_`` and ``\\`` are matched literally
                in the title comparison.
            top_k: Maximum number of results.

        Returns:
            A list of ``{"minutes", "similarity_score"}`` dicts ordered by
            text rank, then by most recent ``finalized_at``. A blank query
            yields an empty list without touching the database.
        """
        if not query or not query.strip():
            # A '%%' title pattern would otherwise match every titled row.
            return []

        title_pattern = f"%{self._escape_like(query)}%"
        with self.get_connection() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # COALESCE keeps a NULL rank (NULL full_content) from sorting
                # ahead of real matches, since NULLs come first under DESC.
                # NULLS LAST does the same for rows that are not finalized.
                cur.execute("""
                    SELECT *,
                        COALESCE(
                            ts_rank(to_tsvector('simple', full_content), plainto_tsquery('simple', %s)),
                            0
                        ) as rank_score
                    FROM rag_minutes
                    WHERE to_tsvector('simple', full_content) @@ plainto_tsquery('simple', %s)
                        OR title ILIKE %s
                    ORDER BY rank_score DESC, finalized_at DESC NULLS LAST
                    LIMIT %s
                """, (
                    query,
                    query,
                    title_pattern,
                    top_k
                ))
                results = []
                for row in cur.fetchall():
                    minutes_dict = dict(row)
                    # rank_score is a computed column, not a model field.
                    rank_score = minutes_dict.pop("rank_score", 0.0)
                    results.append({
                        "minutes": self._row_to_minutes(minutes_dict),
                        "similarity_score": float(rank_score) if rank_score else 0.0
                    })
                logger.info(f"키워드 검색 완료: {len(results)}개 결과")
                return results

    def get_stats(self) -> Dict[str, Any]:
        """Return statistics about the stored minutes.

        Returns:
            A dict with ``total_minutes``, ``total_meetings``,
            ``total_authors`` and ``latest_finalized_at`` (None when no row
            has a ``finalized_at`` value).
        """
        with self.get_connection() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # Overall totals
                cur.execute("""
                    SELECT
                        COUNT(*) as total_minutes,
                        COUNT(DISTINCT meeting_id) as total_meetings,
                        COUNT(DISTINCT created_by) as total_authors
                    FROM rag_minutes
                """)
                overall = cur.fetchone()

                # Most recently finalized minutes
                cur.execute("""
                    SELECT finalized_at
                    FROM rag_minutes
                    WHERE finalized_at IS NOT NULL
                    ORDER BY finalized_at DESC
                    LIMIT 1
                """)
                latest = cur.fetchone()

                latest_finalized_at = None
                if latest and latest["finalized_at"]:
                    value = latest["finalized_at"]
                    # Only datetime values have isoformat(); anything else
                    # is returned as its string form.
                    latest_finalized_at = (
                        value.isoformat() if isinstance(value, datetime) else str(value)
                    )
                return {
                    "total_minutes": overall["total_minutes"],
                    "total_meetings": overall["total_meetings"],
                    "total_authors": overall["total_authors"],
                    "latest_finalized_at": latest_finalized_at
                }

    def delete_minutes(self, minutes_id: str) -> bool:
        """Delete minutes by ID.

        Args:
            minutes_id: Minutes identifier.

        Returns:
            True when the statement ran and was committed (even if no row
            matched); False when any exception was raised.
        """
        try:
            with self.get_connection() as conn:
                with conn.cursor() as cur:
                    cur.execute("""
                        DELETE FROM rag_minutes WHERE minutes_id = %s
                    """, (minutes_id,))
                conn.commit()
            logger.info(f"회의록 삭제 성공: {minutes_id}")
            return True
        except Exception as e:
            logger.error(f"회의록 삭제 실패 ({minutes_id}): {str(e)}")
            return False