faiss_rag_enterprise/llama_index/download/module.py

"""Download."""
import json
import logging
import os
import subprocess
import sys
from enum import Enum
from importlib import util
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import pkg_resources
import requests
from pkg_resources import DistributionNotFound

from llama_index.download.utils import (
    get_exports,
    get_file_content,
    initialize_directory,
    rewrite_exports,
)

LLAMA_HUB_CONTENTS_URL = f"https://raw.githubusercontent.com/run-llama/llama-hub/main"
LLAMA_HUB_PATH = "/llama_hub"
LLAMA_HUB_URL = LLAMA_HUB_CONTENTS_URL + LLAMA_HUB_PATH

PATH_TYPE = Union[str, Path]

logger = logging.getLogger(__name__)
LLAMAHUB_ANALYTICS_PROXY_SERVER = "https://llamahub.ai/api/analytics/downloads"


class MODULE_TYPE(str, Enum):
    LOADER = "loader"
    TOOL = "tool"
    LLAMAPACK = "llamapack"
    DATASETS = "datasets"


def get_module_info(
    local_dir_path: PATH_TYPE,
    remote_dir_path: PATH_TYPE,
    module_class: str,
    refresh_cache: bool = False,
    library_path: str = "library.json",
    disable_library_cache: bool = False,
) -> Dict:
    """Get module info."""
    if isinstance(local_dir_path, str):
        local_dir_path = Path(local_dir_path)

    local_library_path = f"{local_dir_path}/{library_path}"
    module_id = None  # e.g. `web/simple_web`
    extra_files = []  # e.g. `web/simple_web/utils.py`

    # Check cache first
    if not refresh_cache and os.path.exists(local_library_path):
        with open(local_library_path) as f:
            library = json.load(f)
        if module_class in library:
            module_id = library[module_class]["id"]
            extra_files = library[module_class].get("extra_files", [])

    # Fetch up-to-date library from remote repo if module_id not found
    if module_id is None:
        library_raw_content, _ = get_file_content(
            str(remote_dir_path), f"/{library_path}"
        )
        library = json.loads(library_raw_content)
        if module_class not in library:
            raise ValueError("Loader class name not found in library")

        module_id = library[module_class]["id"]
        extra_files = library[module_class].get("extra_files", [])

        # create cache dir if needed
        local_library_dir = os.path.dirname(local_library_path)
        if not disable_library_cache:
            if not os.path.exists(local_library_dir):
                os.makedirs(local_library_dir)

            # Update cache
            with open(local_library_path, "w") as f:
                f.write(library_raw_content)

    if module_id is None:
        raise ValueError("Loader class name not found in library")

    return {
        "module_id": module_id,
        "extra_files": extra_files,
    }


def download_module_and_reqs(
    local_dir_path: PATH_TYPE,
    remote_dir_path: PATH_TYPE,
    module_id: str,
    extra_files: List[str],
    refresh_cache: bool = False,
    use_gpt_index_import: bool = False,
    base_file_name: str = "base.py",
    override_path: bool = False,
) -> None:
    """Load module."""
    if isinstance(local_dir_path, str):
        local_dir_path = Path(local_dir_path)

    if override_path:
        module_path = str(local_dir_path)
    else:
        module_path = f"{local_dir_path}/{module_id}"

    if refresh_cache or not os.path.exists(module_path):
        os.makedirs(module_path, exist_ok=True)

        basepy_raw_content, _ = get_file_content(
            str(remote_dir_path), f"/{module_id}/{base_file_name}"
        )
        if use_gpt_index_import:
            basepy_raw_content = basepy_raw_content.replace(
                "import llama_index", "import llama_index"
            )
            basepy_raw_content = basepy_raw_content.replace(
                "from llama_index", "from llama_index"
            )

        with open(f"{module_path}/{base_file_name}", "w") as f:
            f.write(basepy_raw_content)

    # Get content of extra files if there are any
    # and write them under the loader directory
    for extra_file in extra_files:
        extra_file_raw_content, _ = get_file_content(
            str(remote_dir_path), f"/{module_id}/{extra_file}"
        )
        # If the extra file is an __init__.py file, we need to
        # add the exports to the __init__.py file in the modules directory
        if extra_file == "__init__.py":
            loader_exports = get_exports(extra_file_raw_content)
            existing_exports = []
            init_file_path = local_dir_path / "__init__.py"
            # if the __init__.py file do not exists, we need to create it
            mode = "a+" if not os.path.exists(init_file_path) else "r+"
            with open(init_file_path, mode) as f:
                f.write(f"from .{module_id} import {', '.join(loader_exports)}")
                existing_exports = get_exports(f.read())
            rewrite_exports(existing_exports + loader_exports, str(local_dir_path))

        with open(f"{module_path}/{extra_file}", "w") as f:
            f.write(extra_file_raw_content)

    # install requirements
    requirements_path = f"{local_dir_path}/requirements.txt"

    if not os.path.exists(requirements_path):
        # NOTE: need to check the status code
        response_txt, status_code = get_file_content(
            str(remote_dir_path), f"/{module_id}/requirements.txt"
        )
        if status_code == 200:
            with open(requirements_path, "w") as f:
                f.write(response_txt)

    # Install dependencies if there are any and not already installed
    if os.path.exists(requirements_path):
        try:
            requirements = pkg_resources.parse_requirements(
                Path(requirements_path).open()
            )
            pkg_resources.require([str(r) for r in requirements])
        except DistributionNotFound:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", requirements_path]
            )


def download_llama_module(
    module_class: str,
    llama_hub_url: str = LLAMA_HUB_URL,
    refresh_cache: bool = False,
    custom_dir: Optional[str] = None,
    custom_path: Optional[str] = None,
    library_path: str = "library.json",
    base_file_name: str = "base.py",
    use_gpt_index_import: bool = False,
    disable_library_cache: bool = False,
    override_path: bool = False,
    skip_load: bool = False,
) -> Any:
    """Download a module from LlamaHub.

    Can be a loader, tool, pack, or more.

    Args:
        loader_class: The name of the llama module class you want to download,
            such as `GmailOpenAIAgentPack`.
        refresh_cache: If true, the local cache will be skipped and the
            loader will be fetched directly from the remote repo.
        custom_dir: Custom dir name to download loader into (under parent folder).
        custom_path: Custom dirpath to download loader into.
        library_path: File name of the library file.
        use_gpt_index_import: If true, the loader files will use
            llama_index as the base dependency. By default (False),
            the loader files use llama_index as the base dependency.
            NOTE: this is a temporary workaround while we fully migrate all usages
            to llama_index.
        is_dataset: whether or not downloading a LlamaDataset

    Returns:
        A Loader, A Pack, An Agent, or A Dataset
    """
    # create directory / get path
    dirpath = initialize_directory(custom_path=custom_path, custom_dir=custom_dir)

    # fetch info from library.json file
    module_info = get_module_info(
        local_dir_path=dirpath,
        remote_dir_path=llama_hub_url,
        module_class=module_class,
        refresh_cache=refresh_cache,
        library_path=library_path,
        disable_library_cache=disable_library_cache,
    )
    module_id = module_info["module_id"]
    extra_files = module_info["extra_files"]

    # download the module, install requirements
    download_module_and_reqs(
        local_dir_path=dirpath,
        remote_dir_path=llama_hub_url,
        module_id=module_id,
        extra_files=extra_files,
        refresh_cache=refresh_cache,
        use_gpt_index_import=use_gpt_index_import,
        base_file_name=base_file_name,
        override_path=override_path,
    )
    if skip_load:
        return None

    # loads the module into memory
    if override_path:
        path = f"{dirpath}/{base_file_name}"
        spec = util.spec_from_file_location("custom_module", location=path)
        if spec is None:
            raise ValueError(f"Could not find file: {path}.")
    else:
        path = f"{dirpath}/{module_id}/{base_file_name}"
        spec = util.spec_from_file_location("custom_module", location=path)
        if spec is None:
            raise ValueError(f"Could not find file: {path}.")

    module = util.module_from_spec(spec)
    spec.loader.exec_module(module)  # type: ignore

    return getattr(module, module_class)


def track_download(module_class: str, module_type: str) -> None:
    """Tracks number of downloads via Llamahub proxy.

    Args:
        module_class: The name of the llama module being downloaded, e.g.,`GmailOpenAIAgentPack`.
        module_type: Can be "loader", "tool", "llamapack", or "datasets"
    """
    try:
        requests.post(
            LLAMAHUB_ANALYTICS_PROXY_SERVER,
            json={"type": module_type, "plugin": module_class},
        )
    except Exception as e:
        logger.info(f"Error tracking downloads for {module_class} : {e}")