import os
from dotenv import load_dotenv
from llama_index.readers.web import BeautifulSoupWebReader
from llama_index.core import SimpleDirectoryReader, Document, VectorStoreIndex, StorageContext, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
import chromadb

def load_text_content(text_content: str):
    """
    Wrap a raw string in a Document.

    Returns a one-element list holding a Document whose text is
    `text_content`.
    """
    return [Document(text=text_content)]

def load_youtube_transcripts(youtube_links: list[str]):
    """
    Fetch transcripts for the given YouTube links.

    The links are handed to a YoutubeTranscriptReader, and whatever its
    `load_data` call produces is returned unchanged.
    """
    return YoutubeTranscriptReader().load_data(ytlinks=youtube_links)

def main():
    """
    Build a Chroma-backed vector index from several sources and run one query.

    Steps:
      1. Load environment variables from a .env file and configure the
         Gemini LLM and embedding model as the global LlamaIndex settings.
      2. Open (or create) the "kafu" collection in the persistent ChromaDB
         store at "../chroma_db".
      3. Load documents from web pages, a local PDF, an inline text sample
         and YouTube transcripts.
      4. Index all loaded documents into the vector store.
      5. Run a sample query against the index and print the response.

    Raises:
        KeyError: if the GEMINI_API_KEY environment variable is not set.
    """
    # Load environment variables from .env file
    load_dotenv()

    # Read the key once so the LLM and the embedding model share the same
    # credential.
    gemini_api_key = os.environ["GEMINI_API_KEY"]

    # Initialize Gemini LLM and embedding model.
    # The embedding model is given the key explicitly as well; without it the
    # model has to find a credential on its own (presumably via a different
    # environment variable - TODO confirm), which breaks setups that only
    # define GEMINI_API_KEY.
    llm = Gemini(api_key=gemini_api_key)
    embed_model = GeminiEmbedding(
        model_name="models/embedding-001",
        api_key=gemini_api_key,
    )

    # Set global settings
    Settings.llm = llm
    Settings.embed_model = embed_model
    Settings.chunk_size = 1024

    # Initialize ChromaDB client and collection
    # NOTE(review): the collection is persistent and documents are inserted on
    # every run; presumably repeated runs accumulate duplicates - confirm.
    chroma_client = chromadb.PersistentClient(path="../chroma_db")
    chroma_collection = chroma_client.get_or_create_collection(name="kafu")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    # --- Load Documents ---
    # 1. Web Documents
    web_urls = [
        "https://www.kafu.ac.ke",
        "https://kafu.ac.ke/index.php/admin/information-to-2024-2025-first-year-students",
    ]
    web_loader = BeautifulSoupWebReader()
    web_docs = web_loader.load_data(urls=web_urls)
    print(f"Loaded {len(web_docs)} web documents.")

    # 2. Local Documents
    # Provide file paths for local files (adjust paths as needed)
    local_loader = SimpleDirectoryReader(input_files=["./data/fee.pdf"])
    local_docs = local_loader.load_data()
    print(f"Loaded {len(local_docs)} local documents.")

    # 3. Direct Text Content
    text_content = "This is a sample text document content for testing direct text input."
    text_docs = load_text_content(text_content)
    print(f"Loaded {len(text_docs)} text document.")

    # 4. YouTube Transcripts
    youtube_links = [
        "https://youtu.be/WeyDovgSen4",
    ]
    youtube_docs = load_youtube_transcripts(youtube_links)
    print(f"Loaded {len(youtube_docs)} YouTube transcript documents.")

    # Combine all documents. The YouTube transcripts must be part of this
    # list, otherwise they are loaded but never indexed.
    documents = [*web_docs, *local_docs, *text_docs, *youtube_docs]
    print(f"Total documents loaded: {len(documents)}")

    # --- Train the Vector Store ---
    # Create a StorageContext using the ChromaVectorStore
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Create an index from the combined documents
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )

    # Set up query engine
    query_engine = index.as_query_engine()

    # --- Execute a Query ---
    query = "Difference of react and next js"
    response = query_engine.query(query)

    print(f"\nQuery: {query}\nResponse:\n{response}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
