feat: init rag service

2026-07-28 09:09:10 +00:00 · 2025-10-29 05:54:08 +09:00
parent 44ae9c546f
commit 5d897cb845
54 changed files with 6425 additions and 0 deletions
@@ -0,0 +1,180 @@
+"""
+FastAPI 엔드포인트 테스트
+"""
+import pytest
+from fastapi.testclient import TestClient
+from pathlib import Path
+import sys
+
+# Add the project root directory to the Python path so the `src` import below resolves
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+from src.api.main import app
+
+# Module-level test client used by the test functions in this file
+client = TestClient(app)
+
+
+def test_root():
+    """루트 엔드포인트 테스트"""
+    response = client.get("/")
+    assert response.status_code == 200
+    data = response.json()
+    assert data["service"] == "Vector DB 통합 시스템"
+    assert data["version"] == "1.0.0"
+
+
+def test_search_terms_keyword():
+    """용어 키워드 검색 테스트"""
+    response = client.post(
+        "/api/terms/search",
+        json={
+            "query": "API",
+            "search_type": "keyword",
+            "top_k": 5,
+            "confidence_threshold": 0.7
+        }
+    )
+    assert response.status_code == 200
+    results = response.json()
+    assert isinstance(results, list)
+
+    if len(results) > 0:
+        result = results[0]
+        assert "term" in result
+        assert "relevance_score" in result
+        assert "match_type" in result
+
+
+def test_search_terms_vector():
+    """용어 벡터 검색 테스트"""
+    response = client.post(
+        "/api/terms/search",
+        json={
+            "query": "회의 일정 관리",
+            "search_type": "vector",
+            "top_k": 3,
+            "confidence_threshold": 0.6
+        }
+    )
+    assert response.status_code == 200
+    results = response.json()
+    assert isinstance(results, list)
+
+
+def test_search_terms_hybrid():
+    """용어 하이브리드 검색 테스트"""
+    response = client.post(
+        "/api/terms/search",
+        json={
+            "query": "마이크로서비스",
+            "search_type": "hybrid",
+            "top_k": 5,
+            "confidence_threshold": 0.5
+        }
+    )
+    assert response.status_code == 200
+    results = response.json()
+    assert isinstance(results, list)
+
+
+def test_get_term_stats():
+    """용어 통계 조회 테스트"""
+    response = client.get("/api/terms/stats")
+    assert response.status_code == 200
+    stats = response.json()
+    assert "total_terms" in stats
+    assert "by_category" in stats
+    assert "avg_confidence" in stats
+
+
+def test_search_documents():
+    """관련 문서 검색 테스트"""
+    response = client.post(
+        "/api/documents/search",
+        json={
+            "query": "프로젝트 계획",
+            "top_k": 3,
+            "relevance_threshold": 0.3,
+            "semantic_ranking": True
+        }
+    )
+    assert response.status_code == 200
+    results = response.json()
+    assert isinstance(results, list)
+
+    if len(results) > 0:
+        result = results[0]
+        assert "document_id" in result
+        assert "title" in result
+        assert "content" in result
+        assert "relevance_score" in result
+
+
+def test_search_documents_with_filters():
+    """필터링된 문서 검색 테스트"""
+    response = client.post(
+        "/api/documents/search",
+        json={
+            "query": "회의록",
+            "top_k": 5,
+            "relevance_threshold": 0.3,
+            "document_type": "회의록",
+            "semantic_ranking": True
+        }
+    )
+    assert response.status_code == 200
+    results = response.json()
+    assert isinstance(results, list)
+
+
+def test_get_document_stats():
+    """문서 통계 조회 테스트"""
+    response = client.get("/api/documents/stats")
+    assert response.status_code == 200
+    stats = response.json()
+    assert "total_documents" in stats
+    assert "by_type" in stats
+    assert "total_chunks" in stats
+
+
+def test_get_nonexistent_term():
+    """존재하지 않는 용어 조회 테스트"""
+    response = client.get("/api/terms/nonexistent-term-id")
+    assert response.status_code == 404
+
+
+def test_explain_term():
+    """용어 설명 생성 테스트 (Claude AI)"""
+    # 먼저 용어 검색
+    search_response = client.post(
+        "/api/terms/search",
+        json={
+            "query": "API",
+            "search_type": "keyword",
+            "top_k": 1
+        }
+    )
+
+    if search_response.status_code == 200:
+        results = search_response.json()
+        if len(results) > 0:
+            term_id = results[0]["term"]["term_id"]
+
+            # 용어 설명 생성
+            explain_response = client.post(
+                f"/api/terms/{term_id}/explain",
+                json={
+                    "meeting_context": "백엔드 개발 회의에서 REST API 설계 논의"
+                }
+            )
+
+            assert explain_response.status_code == 200
+            explanation = explain_response.json()
+            assert "term" in explanation
+            assert "explanation" in explanation
+            assert "generated_by" in explanation
+
+
+# When executed directly, run this file's tests through pytest in verbose mode
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
@@ -0,0 +1,234 @@
+"""
+데이터 로딩 및 임베딩 생성 테스트
+"""
+import sys
+from pathlib import Path
+import json
+
+# Add the project root directory to the Python path
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+from src.models.term import Term, DocumentSource
+from src.models.document import Document, DocumentMetadata
+from src.utils.config import load_config
+from src.utils.embedding import EmbeddingGenerator
+
+
+def test_load_config():
+    """설정 로드 테스트"""
+    print("=" * 60)
+    print("설정 로드 테스트")
+    print("=" * 60)
+
+    config = load_config(str(project_root / "config.yaml"))
+    assert config is not None
+    assert "postgres" in config
+    assert "azure_openai" in config
+    assert "azure_search" in config
+    assert "claude" in config
+
+    print("✓ 설정 로드 성공")
+    print(f"  - PostgreSQL 호스트: {config['postgres']['host']}")
+    print(f"  - Azure OpenAI 모델: {config['azure_openai']['embedding_model']}")
+    print(f"  - Azure Search 인덱스: {config['azure_search']['index_name']}")
+    print(f"  - Claude 모델: {config['claude']['model']}")
+    print()
+
+
+def test_load_term_data():
+    """용어 데이터 로드 테스트"""
+    print("=" * 60)
+    print("용어 데이터 로드 테스트")
+    print("=" * 60)
+
+    data_dir = project_root.parent / "design/aidata"
+    terms_files = ["terms-01.json", "terms-02.json", "terms-03.json", "terms-04.json"]
+
+    all_terms = []
+    for filename in terms_files:
+        file_path = data_dir / filename
+        if file_path.exists():
+            with open(file_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            for domain_data in data.get("terms", []):
+                for term_data in domain_data.get("data", []):
+                    # DocumentSource 파싱
+                    doc_source = None
+                    if "document_source" in term_data:
+                        doc_source = DocumentSource(**term_data["document_source"])
+
+                    # Term 객체 생성
+                    term = Term(
+                        term_id=term_data["term_id"],
+                        term_name=term_data["term_name"],
+                        normalized_name=term_data["normalized_name"],
+                        category=term_data["category"],
+                        definition=term_data["definition"],
+                        context=term_data.get("context", ""),
+                        synonyms=term_data.get("synonyms", []),
+                        related_terms=term_data.get("related_terms", []),
+                        document_source=doc_source,
+                        confidence_score=term_data.get("confidence_score", 0.0),
+                        usage_count=term_data.get("usage_count", 0),
+                        last_updated=term_data.get("last_updated"),
+                        embedding=None
+                    )
+                    all_terms.append(term)
+
+            print(f"✓ {filename} 로드 완료: {len([t for t in all_terms if t])}개 용어")
+
+    print(f"\n총 {len(all_terms)}개 용어 로드 완료")
+
+    # 카테고리별 통계
+    category_stats = {}
+    for term in all_terms:
+        category = term.category
+        category_stats[category] = category_stats.get(category, 0) + 1
+
+    print("\n카테고리별 통계:")
+    for category, count in sorted(category_stats.items(), key=lambda x: x[1], reverse=True):
+        print(f"  - {category}: {count}개")
+    print()
+
+    return all_terms
+
+
+def test_load_document_data():
+    """관련 문서 데이터 로드 테스트"""
+    print("=" * 60)
+    print("관련 문서 데이터 로드 테스트")
+    print("=" * 60)
+
+    data_file = project_root.parent / "design/meet-ref.json"
+    if not data_file.exists():
+        print(f"❌ 파일 없음: {data_file}")
+        return []
+
+    with open(data_file, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    documents = []
+    for domain, doc_types in data.get("sample_data", {}).items():
+        for doc_type, docs in doc_types.items():
+            for doc_data in docs:
+                # Metadata 파싱
+                metadata = None
+                if "metadata" in doc_data:
+                    metadata = DocumentMetadata(**doc_data["metadata"])
+
+                # Document 객체 생성
+                doc = Document(
+                    document_id=doc_data["document_id"],
+                    document_type=doc_data["document_type"],
+                    business_domain=doc_data.get("business_domain"),
+                    title=doc_data["title"],
+                    content=doc_data["content"],
+                    summary=doc_data["summary"],
+                    keywords=doc_data.get("keywords", []),
+                    created_date=doc_data.get("created_date"),
+                    participants=doc_data.get("participants", []),
+                    metadata=metadata,
+                    embedding=None
+                )
+                documents.append(doc)
+
+    print(f"✓ {len(documents)}개 문서 로드 완료")
+
+    # 문서 타입별 통계
+    type_stats = {}
+    for doc in documents:
+        doc_type = doc.document_type
+        type_stats[doc_type] = type_stats.get(doc_type, 0) + 1
+
+    print("\n문서 타입별 통계:")
+    for doc_type, count in sorted(type_stats.items(), key=lambda x: x[1], reverse=True):
+        print(f"  - {doc_type}: {count}개")
+    print()
+
+    return documents
+
+
+def test_embedding_generation():
+    """임베딩 생성 테스트"""
+    print("=" * 60)
+    print("임베딩 생성 테스트")
+    print("=" * 60)
+
+    config = load_config(str(project_root / "config.yaml"))
+    azure_openai = config["azure_openai"]
+
+    try:
+        embedding_gen = EmbeddingGenerator(
+            api_key=azure_openai["api_key"],
+            endpoint=azure_openai["endpoint"],
+            model=azure_openai["embedding_model"],
+            dimension=azure_openai["embedding_dimension"],
+            api_version=azure_openai["api_version"]
+        )
+        print("✓ 임베딩 생성기 초기화 완료")
+
+        # 단일 임베딩 생성 테스트
+        test_text = "API는 Application Programming Interface의 약자입니다."
+        embedding = embedding_gen.generate_embedding(test_text)
+
+        print(f"✓ 단일 임베딩 생성 성공")
+        print(f"  - 차원: {len(embedding)}")
+        print(f"  - 예시 값: {embedding[:5]}")
+
+        # 배치 임베딩 생성 테스트
+        test_texts = [
+            "마이크로서비스는 소프트웨어 아키텍처 패턴입니다.",
+            "REST API는 웹 서비스 설계 방식입니다.",
+            "클라우드 네이티브는 클라우드 환경에 최적화된 애플리케이션입니다."
+        ]
+        embeddings = embedding_gen.generate_embeddings_batch(test_texts)
+
+        print(f"✓ 배치 임베딩 생성 성공")
+        print(f"  - 생성된 임베딩 수: {len(embeddings)}")
+        print(f"  - 각 임베딩 차원: {len(embeddings[0])}")
+        print()
+
+        return True
+
+    except Exception as e:
+        print(f"❌ 임베딩 생성 실패: {str(e)}")
+        print("  → Azure OpenAI API 키와 엔드포인트를 확인하세요")
+        print()
+        return False
+
+
+def main():
+    """메인 테스트 함수"""
+    print("\n" + "=" * 60)
+    print("Vector DB 데이터 로딩 테스트")
+    print("=" * 60 + "\n")
+
+    # 1. 설정 로드 테스트
+    test_load_config()
+
+    # 2. 용어 데이터 로드 테스트
+    terms = test_load_term_data()
+
+    # 3. 문서 데이터 로드 테스트
+    documents = test_load_document_data()
+
+    # 4. 임베딩 생성 테스트
+    embedding_ok = test_embedding_generation()
+
+    # 결과 요약
+    print("=" * 60)
+    print("테스트 결과 요약")
+    print("=" * 60)
+    print(f"✓ 용어 데이터: {len(terms)}개 로드")
+    print(f"✓ 문서 데이터: {len(documents)}개 로드")
+    if embedding_ok:
+        print(f"✓ 임베딩 생성: 정상")
+    else:
+        print(f"⚠ 임베딩 생성: 설정 필요 (Azure OpenAI API 키)")
+    print("=" * 60)
+
+
+# When executed directly, run the checks through main() and print the summary
+if __name__ == "__main__":
+    main()