# Source code for dacy.datasets.names

"""
Helper functions for loading name dictionaries for person augmentation.
"""

import os
from typing import Dict, List, Optional
import pandas as pd


def load_names(
    min_count: int = 0,
    ethnicity: Optional[str] = None,
    gender: Optional[str] = None,
    min_prop_gender: float = 0,
) -> Dict[str, List[str]]:
    """
    Loads first and last names from the bundled names lookup table.

    Danish names are from Danmarks statistik (2021).
    Muslim names are from Meldgaard (2005), https://nors.ku.dk/publikationer/webpublikationer/muslimske_fornavne/.

    Args:
        min_count (int, optional): Minimum number of occurrences of the name for it to be included.
            Rows with a lower count are dropped before any other filtering. Defaults to 0.
        ethnicity (Optional[str], optional): Which ethnicity should be included. None indicates all
            are included. Options include "muslim", "danish". Defaults to None.
        gender (Optional[str], optional): Which gender should be included. None indicates all are
            included. Options include "male", "female". Only first names are filtered by gender;
            last names are returned regardless of this argument. Defaults to None.
        min_prop_gender (float): Minimum probability of a name being the given gender. The
            probability is the share of the name's summed count (after the min_count and ethnicity
            filters) that belongs to that gender. Only used when gender is set. Defaults to 0.

    Returns:
        Dict[str, List[str]]: A dictionary with the keys "first_name" and "last_name", each mapping
            to the list of names that passed the filters.
    """
    path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "lookup_tables", "names.csv"
    )
    names = pd.read_csv(path)

    if min_count:
        names = names.loc[names["count"] >= min_count]

    if ethnicity is not None:
        names = names.loc[names["ethnicity"] == ethnicity]

    # Last names are selected before the gender step on purpose: the gender
    # filter below only narrows the first names.
    last_names = names.loc[names["first_name"] == False]  # noqa: E712

    if gender is not None:
        # Total count per (name, gender, first_name) combination.
        counts = names.groupby(["name", "gender", "first_name"])["count"].sum()
        # Per-name totals broadcast back onto the same index. ``transform``
        # keeps the index unchanged, unlike ``groupby.apply``, which on recent
        # pandas versions prepends the group key and breaks ``reset_index``.
        totals = counts.groupby(level="name").transform("sum")
        proportions = (counts / totals).rename("proportion").reset_index()
        names = proportions.loc[
            (proportions["gender"] == gender)
            & (proportions["proportion"] >= min_prop_gender)
        ]

    first_names = names.loc[names["first_name"] == True]  # noqa: E712
    return {
        "first_name": first_names["name"].tolist(),
        "last_name": last_names["name"].tolist(),
    }


def muslim_names() -> Dict[str, List[str]]:
    """Returns a dictionary of Muslim names.

    Equivalent to calling ``load_names(ethnicity="muslim")``.

    Returns:
        dict: A dictionary of Muslim names containing the keys "first_name" and "last_name",
            each mapping to a list of names. The list is derived from Meldgaard (2005),
            https://nors.ku.dk/publikationer/webpublikationer/muslimske_fornavne/.

    Example:
        >>> from dacy.datasets import muslim_names
        >>> names = muslim_names()
        >>> sorted(names)
        ['first_name', 'last_name']
    """
    return load_names(ethnicity="muslim")


def danish_names() -> Dict[str, List[str]]:
    """Returns a dictionary of Danish names.

    Equivalent to calling ``load_names(ethnicity="danish")``.

    Returns:
        dict: A dictionary of Danish names containing the keys "first_name" and "last_name",
            each mapping to a list of names. The list is derived from Danmarks statistik (2021).

    Example:
        >>> from dacy.datasets import danish_names
        >>> names = danish_names()
        >>> sorted(names)
        ['first_name', 'last_name']
    """
    return load_names(ethnicity="danish")


def female_names() -> Dict[str, List[str]]:
    """Returns a dictionary of Danish female names.

    Equivalent to calling
    ``load_names(ethnicity="danish", gender="female", min_prop_gender=0.5)``: a first name is
    included when at least half of its summed count is female. Last names are not filtered by
    gender.

    Returns:
        dict: A dictionary of names containing the keys "first_name" and "last_name", each mapping
            to a list of names. The list is derived from Danmarks statistik (2021).

    Example:
        >>> from dacy.datasets import female_names
        >>> names = female_names()
        >>> sorted(names)
        ['first_name', 'last_name']
    """
    return load_names(ethnicity="danish", gender="female", min_prop_gender=0.5)


def male_names() -> Dict[str, List[str]]:
    """Returns a dictionary of Danish male names.

    Equivalent to calling
    ``load_names(ethnicity="danish", gender="male", min_prop_gender=0.5)``: a first name is
    included when at least half of its summed count is male. Last names are not filtered by
    gender.

    Returns:
        dict: A dictionary of names containing the keys "first_name" and "last_name", each mapping
            to a list of names. The list is derived from Danmarks statistik (2021).

    Example:
        >>> from dacy.datasets import male_names
        >>> names = male_names()
        >>> sorted(names)
        ['first_name', 'last_name']
    """
    return load_names(ethnicity="danish", gender="male", min_prop_gender=0.5)