# hgzero/rag/tests/test_data_loading.py

"""
데이터 로딩 및 임베딩 생성 테스트
"""
import sys
from pathlib import Path
import json

# 프로젝트 루트 디렉토리를 Python 경로에 추가
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from src.models.term import Term, DocumentSource
from src.models.document import Document, DocumentMetadata
from src.utils.config import load_config
from src.utils.embedding import EmbeddingGenerator


def test_load_config():
    """Load ``config.yaml`` from the project root and check its sections.

    Asserts that the loaded configuration is not ``None`` and contains the
    ``postgres``, ``azure_openai``, ``azure_search`` and ``claude`` keys,
    then prints one value from each of those sections.
    """
    banner = "=" * 60
    print(banner)
    print("설정 로드 테스트")
    print(banner)

    config_path = project_root / "config.yaml"
    config = load_config(str(config_path))

    # Structural checks: the file loaded and every expected section exists.
    assert config is not None
    assert "postgres" in config
    assert "azure_openai" in config
    assert "azure_search" in config
    assert "claude" in config

    print("✓ 설정 로드 성공")
    # Each section is looked up right before it is printed so that a missing
    # key surfaces at the same point in the output as it always has.
    postgres_cfg = config['postgres']
    print(f"  - PostgreSQL 호스트: {postgres_cfg['host']}")
    openai_cfg = config['azure_openai']
    print(f"  - Azure OpenAI 모델: {openai_cfg['embedding_model']}")
    search_cfg = config['azure_search']
    print(f"  - Azure Search 인덱스: {search_cfg['index_name']}")
    claude_cfg = config['claude']
    print(f"  - Claude 모델: {claude_cfg['model']}")
    print()


def test_load_term_data():
    """Load glossary terms from the ``terms-0N.json`` files into ``Term`` objects.

    Reads each file under ``<project_root>/../design/aidata``, walks the
    ``terms`` -> ``data`` nesting of the JSON payload and builds one ``Term``
    per entry (attaching a ``DocumentSource`` when the entry carries a
    ``document_source`` mapping). Prints the number of terms loaded from each
    file, the overall total, and a per-category breakdown sorted by
    descending count.

    Returns:
        list: every ``Term`` that was built, in file order.

    Raises:
        KeyError: if an entry lacks one of the required fields
            (``term_id``, ``term_name``, ``normalized_name``, ``category``,
            ``definition``).
    """
    print("=" * 60)
    print("용어 데이터 로드 테스트")
    print("=" * 60)

    data_dir = project_root.parent / "design/aidata"
    terms_files = ["terms-01.json", "terms-02.json", "terms-03.json", "terms-04.json"]

    all_terms = []
    for filename in terms_files:
        file_path = data_dir / filename
        if not file_path.exists():
            # Report the gap instead of skipping silently, using the same
            # message the document loader prints for a missing file.
            print(f"❌ 파일 없음: {file_path}")
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Remember where this file's terms start so the message below can
        # report a per-file count rather than the running total.
        loaded_before = len(all_terms)

        for domain_data in data.get("terms", []):
            for term_data in domain_data.get("data", []):
                # Parse the optional DocumentSource.
                doc_source = None
                if "document_source" in term_data:
                    doc_source = DocumentSource(**term_data["document_source"])

                # Build the Term; embeddings are not generated here.
                term = Term(
                    term_id=term_data["term_id"],
                    term_name=term_data["term_name"],
                    normalized_name=term_data["normalized_name"],
                    category=term_data["category"],
                    definition=term_data["definition"],
                    context=term_data.get("context", ""),
                    synonyms=term_data.get("synonyms", []),
                    related_terms=term_data.get("related_terms", []),
                    document_source=doc_source,
                    confidence_score=term_data.get("confidence_score", 0.0),
                    usage_count=term_data.get("usage_count", 0),
                    last_updated=term_data.get("last_updated"),
                    embedding=None
                )
                all_terms.append(term)

        loaded_from_file = len(all_terms) - loaded_before
        print(f"✓ {filename} 로드 완료: {loaded_from_file}개 용어")

    print(f"\n총 {len(all_terms)}개 용어 로드 완료")

    # Per-category statistics.
    category_stats = {}
    for term in all_terms:
        category = term.category
        category_stats[category] = category_stats.get(category, 0) + 1

    print("\n카테고리별 통계:")
    for category, count in sorted(category_stats.items(), key=lambda x: x[1], reverse=True):
        print(f"  - {category}: {count}개")
    print()

    return all_terms


def test_load_document_data():
    """Load reference documents from ``design/meet-ref.json`` into ``Document`` objects.

    Walks the ``sample_data`` mapping (domain -> document type -> list of
    entries), builds one ``Document`` per entry (attaching a
    ``DocumentMetadata`` when the entry has a ``metadata`` mapping), and
    prints the total plus a per-type breakdown sorted by descending count.

    Returns:
        list: the ``Document`` objects built, or an empty list when the
        data file does not exist.
    """
    separator = "=" * 60
    print(separator)
    print("관련 문서 데이터 로드 테스트")
    print(separator)

    data_file = project_root.parent / "design/meet-ref.json"
    if not data_file.exists():
        print(f"❌ 파일 없음: {data_file}")
        return []

    with open(data_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    documents = []
    # Only the innermost lists are needed; the domain and type keys of the
    # two outer mappings are not used when building the objects.
    for by_type in payload.get("sample_data", {}).values():
        for entries in by_type.values():
            for entry in entries:
                # Optional metadata block.
                metadata = (
                    DocumentMetadata(**entry["metadata"])
                    if "metadata" in entry
                    else None
                )

                documents.append(
                    Document(
                        document_id=entry["document_id"],
                        document_type=entry["document_type"],
                        business_domain=entry.get("business_domain"),
                        title=entry["title"],
                        content=entry["content"],
                        summary=entry["summary"],
                        keywords=entry.get("keywords", []),
                        created_date=entry.get("created_date"),
                        participants=entry.get("participants", []),
                        metadata=metadata,
                        embedding=None
                    )
                )

    print(f"✓ {len(documents)}개 문서 로드 완료")

    # Tally documents per document type, keeping first-seen order.
    type_stats = {}
    for document in documents:
        kind = document.document_type
        if kind in type_stats:
            type_stats[kind] += 1
        else:
            type_stats[kind] = 1

    print("\n문서 타입별 통계:")
    ranked = list(type_stats.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    for doc_type, count in ranked:
        print(f"  - {doc_type}: {count}개")
    print()

    return documents


def test_embedding_generation():
    """Exercise single and batch embedding generation via ``EmbeddingGenerator``.

    Loads ``config.yaml``, builds an ``EmbeddingGenerator`` from its
    ``azure_openai`` section, then requests one single embedding and one
    batch of three, printing basic facts about the results.

    Returns:
        bool: ``True`` when every step succeeded; ``False`` when any
        exception was raised after the configuration was loaded (the error
        is printed together with a hint to check the API key and endpoint).
    """
    rule = "=" * 60
    print(rule)
    print("임베딩 생성 테스트")
    print(rule)

    # Config loading stays outside the try block: a broken or missing config
    # propagates instead of being reported as an embedding failure.
    config = load_config(str(project_root / "config.yaml"))
    azure_openai = config["azure_openai"]

    try:
        generator = EmbeddingGenerator(
            api_key=azure_openai["api_key"],
            endpoint=azure_openai["endpoint"],
            model=azure_openai["embedding_model"],
            dimension=azure_openai["embedding_dimension"],
            api_version=azure_openai["api_version"]
        )
        print("✓ 임베딩 생성기 초기화 완료")

        # Single-text embedding.
        sample_text = "API는 Application Programming Interface의 약자입니다."
        single_vector = generator.generate_embedding(sample_text)

        print("✓ 단일 임베딩 생성 성공")
        print(f"  - 차원: {len(single_vector)}")
        print(f"  - 예시 값: {single_vector[:5]}")

        # Batch embedding.
        sample_batch = [
            "마이크로서비스는 소프트웨어 아키텍처 패턴입니다.",
            "REST API는 웹 서비스 설계 방식입니다.",
            "클라우드 네이티브는 클라우드 환경에 최적화된 애플리케이션입니다."
        ]
        batch_vectors = generator.generate_embeddings_batch(sample_batch)

        print("✓ 배치 임베딩 생성 성공")
        print(f"  - 생성된 임베딩 수: {len(batch_vectors)}")
        print(f"  - 각 임베딩 차원: {len(batch_vectors[0])}")
        print()
    except Exception as e:
        # Best-effort check: report the failure and let the caller continue.
        print(f"❌ 임베딩 생성 실패: {e!s}")
        print("  → Azure OpenAI API 키와 엔드포인트를 확인하세요")
        print()
        return False
    else:
        return True


def main():
    """Run every check in this file in order and print a short summary.

    Calls the config, term-data, document-data and embedding checks, then
    reports how many terms and documents were loaded and whether embedding
    generation succeeded.
    """
    divider = "=" * 60

    print("\n" + divider)
    print("Vector DB 데이터 로딩 테스트")
    print(divider + "\n")

    # 1. Configuration loading
    test_load_config()

    # 2. Term data loading
    terms = test_load_term_data()

    # 3. Document data loading
    documents = test_load_document_data()

    # 4. Embedding generation
    embedding_ok = test_embedding_generation()

    # Summary of results
    print(divider)
    print("테스트 결과 요약")
    print(divider)
    print(f"✓ 용어 데이터: {len(terms)}개 로드")
    print(f"✓ 문서 데이터: {len(documents)}개 로드")
    embedding_status = (
        "✓ 임베딩 생성: 정상"
        if embedding_ok
        else "⚠ 임베딩 생성: 설정 필요 (Azure OpenAI API 키)"
    )
    print(embedding_status)
    print(divider)


# Run all checks in sequence when this file is executed directly as a script.
if __name__ == "__main__":
    main()