Source code for dacy.download

"""
Functions for downloading DaCy models.
"""
import os
import shutil
from pathlib import Path
from typing import Optional

from tqdm import tqdm
from wasabi import msg


DEFAULT_CACHE_DIR = os.getenv("DACY_CACHE_DIR", os.path.join(str(Path.home()), ".dacy"))

models_url = {
    "da_dacy_small_tft-0.0.0": "https://sciencedata.dk//shared/d845d4fef9ea165ee7bd6dd954b95de2?download",
    "da_dacy_medium_tft-0.0.0": "https://sciencedata.dk//shared/c205edf59195583122d7213a3c26c077?download",
    "da_dacy_large_tft-0.0.0": "https://sciencedata.dk//shared/0da7cb975b245d9e6574458c7c89dfd9?download",
    "da_dacy_small_trf-0.1.0": "https://huggingface.co/chcaa/da_dacy_small_trf/resolve/main/da_dacy_small_trf-any-py3-none-any.whl",
    "da_dacy_medium_trf-0.1.0": "https://huggingface.co/chcaa/da_dacy_medium_trf/resolve/main/da_dacy_medium_trf-any-py3-none-any.whl",
    "da_dacy_large_trf-0.1.0": "https://huggingface.co/chcaa/da_dacy_large_trf/resolve/main/da_dacy_large_trf-any-py3-none-any.whl",
}


[docs]def models() -> list: """ Returns a list of valid DaCy models Returns: list: list of valid DaCy models """ return list(models_url.keys())
[docs]def download_model( model: str, save_path: Optional[str] = None, force: bool = False, verbose: bool = True, ) -> str: """ Downloads and install a specified DaCy pipeline. Args: model (str): string indicating DaCy model, use dacy.models() to get a list of models save_path (str, optional): The path you want to save your model to. Is only used for DaCy models of v0.0.0 as later models are installed as modules to allow for better versioning. Defaults to None denoting the default cache directory. This can be found using using dacy.where_is_my_dacy(). force (bool, optional): Should it download the model regardless of it already being present? Defaults to False. verbose (bool): Toggles the verbosity of the function. Defaults to True. Returns: a string of the model location Example: >>> download_model(model="da_dacy_medium_tft-0.0.0") """ if model in {"small", "medium", "large"}: model = f"da_dacy_{model}_trf-0.1.0" if model not in models_url: raise ValueError( "The model is not available in DaCy. Please use dacy.models() to see a list of all models" ) if int(model.split("-")[-1].split(".")[1]) < 1: # model v. 0.0.0 if save_path is None: save_path = DEFAULT_CACHE_DIR url = models_url[model] path = os.path.join(save_path, model) dl_path = os.path.join(save_path, "tmp.zip") if os.path.exists(path) and force is False: return path if verbose is True: msg.info(f"\nDownloading '{model}'") Path(save_path).mkdir(parents=True, exist_ok=True) download_url(url, dl_path) shutil.unpack_archive(dl_path, save_path) os.remove(dl_path) if verbose is True: msg.info( f"\Model successfully downloaded, you can now load it using dacy.load({model})" ) return path else: from spacy.util import get_installed_models mdl = model.split("-")[0] if mdl not in get_installed_models(): install(models_url[model]) return mdl
class DownloadProgressBar(tqdm): def update_to(self, b: int = 1, bsize: int = 1, tsize=None) -> None: if tsize is not None: self.total = tsize self.update(b * bsize - self.n) def download_url(url: str, output_path: str) -> None: import urllib.request with DownloadProgressBar( unit="B", unit_scale=True, miniters=1, desc=url.split("/")[-1] ) as t: urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)
[docs]def install(package): import subprocess import sys subprocess.check_call( [sys.executable, "-m", "pip", "install", package, "--no-deps"] )