Source code for dacy.datasets.dane

"""
This includes the DaNE dataset wrapped and read in as a SpaCy corpus.
"""

import os
from pathlib import Path
import shutil
from typing import List, Optional, Union

from spacy.training import Corpus

from ..download import DEFAULT_CACHE_DIR, download_url
from .constants import DATASETS


def dane(
    save_path: Optional[str] = None,
    splits: List[str] = ["train", "dev", "test"],
    redownload: bool = False,
    n_sents: int = 1,
    open_unverified_connection: bool = False,
    **kwargs,
) -> Union[List[Corpus], Corpus]:
    """
    Reads the DaNE dataset as a spacy Corpus.

    The dataset archive is downloaded and unpacked into ``<save_path>/dane`` when
    that folder is missing (or when ``redownload`` is True). Each ``.conllu`` file
    is then converted with ``python -m spacy convert`` unless a converted file
    for the requested ``n_sents`` already exists.

    Args:
        save_path (str, optional): Folder in which the ``dane`` dataset folder
            lives. If it does not contain the dataset, it is downloaded there.
            Defaults to None, in which case the ``datasets`` subfolder of
            ``DEFAULT_CACHE_DIR`` is used.
        splits (List[str], optional): Which splits of the dataset should be
            returned. Possible options include "train", "dev", "test", "all".
            A single string is also accepted.
            Defaults to ["train", "dev", "test"].
        redownload (bool, optional): Should the dataset be redownloaded.
            Defaults to False.
        n_sents (int, optional): Number of sentences per document, passed to the
            spaCy converter as ``-n``. A conversion is only run for files that
            have not already been converted with this value. Defaults to 1.
        open_unverified_connection (bool, optional): Should you download from an
            unverified connection. Defaults to False.
        **kwargs: Accepted for backward compatibility (e.g. the previously
            documented ``force_extension``) but not used by this function.

    Returns:
        Union[List[Corpus], Corpus]: Returns a SpaCy corpus or a list thereof.
            A single Corpus is returned when exactly one split is requested.

    Raises:
        KeyError: If a requested split is not one of "train", "dev", "test",
            "all".
        subprocess.CalledProcessError: If the spaCy conversion exits with a
            non-zero status.

    Example:
        >>> from dacy.datasets import dane
        >>> train, dev, test = dane(splits=["train", "dev", "test"])
    """
    # Function-scope imports: only the conversion step needs these.
    import subprocess
    import sys

    split_files = {
        "all": f"dane_{n_sents}.spacy",
        "test": f"dane_test_{n_sents}.spacy",
        "dev": f"dane_dev_{n_sents}.spacy",
        "train": f"dane_train_{n_sents}.spacy",
    }

    # Normalise and validate the requested splits before any download or
    # conversion work is done. The list default is never mutated.
    splits = [splits] if isinstance(splits, str) else list(splits)
    for split in splits:
        if split not in split_files:
            raise KeyError(
                f"Unknown split {split!r}; valid options are {sorted(split_files)}"
            )

    if open_unverified_connection:
        import ssl

        # NOTE: this swaps the default HTTPS context factory on the ssl module
        # itself and is not restored afterwards.
        ssl._create_default_https_context = ssl._create_unverified_context

    if save_path is None:
        root_dir = os.path.join(DEFAULT_CACHE_DIR, "datasets")
    else:
        root_dir = save_path
    dane_dir = os.path.join(root_dir, "dane")

    if redownload is True or not os.path.isdir(dane_dir):
        Path(dane_dir).mkdir(parents=True, exist_ok=True)
        archive_path = os.path.join(dane_dir, "dane.zip")
        download_url(DATASETS["dane"], archive_path)
        shutil.unpack_archive(archive_path, dane_dir)
        os.remove(archive_path)

    conllu_files = (
        "dane_train.conllu",
        "dane_dev.conllu",
        "dane_test.conllu",
        "dane.conllu",
    )
    for conllu_name in conllu_files:
        source_path = os.path.join(dane_dir, conllu_name)
        target_stem = os.path.splitext(source_path)[0] + f"_{n_sents}"
        if os.path.isfile(target_stem + ".spacy"):
            continue

        # Convert a temporary copy whose name carries the n_sents suffix; the
        # converter is expected to write "<stem>.spacy" into dane_dir, which
        # is the file name looked up in split_files.
        tmp_conllu = target_stem + ".conllu"
        shutil.copyfile(source_path, tmp_conllu)
        try:
            # Argument list (no shell) so paths containing spaces or shell
            # metacharacters are passed through verbatim; sys.executable makes
            # sure the interpreter running this code is the one used.
            subprocess.run(
                [
                    sys.executable,
                    "-m",
                    "spacy",
                    "convert",
                    tmp_conllu,
                    dane_dir,
                    "--converter",
                    "conllu",
                    "--merge-subtokens",
                    "-n",
                    str(n_sents),
                ],
                check=True,
            )
        finally:
            os.remove(tmp_conllu)

    corpora = [Corpus(os.path.join(dane_dir, split_files[split])) for split in splits]
    if len(corpora) == 1:
        return corpora[0]
    return corpora