# Source code for scholarmetrics.metrics

"""Collection of common metrics for academic scholars."""

import numpy as np
import numpy.typing as npt

__all__ = ['euclidean', 'gindex', 'hindex']



# [docs]
def euclidean(arr: npt.ArrayLike, ignore_nan: bool = True) -> float:
    """
    Calculate Euclidean index for an author.

    A Euclidean index of a vector is the square root of the sum of
    the squared elements.

    Parameters
    ----------
    arr : array-like
        Array of citations.

    ignore_nan : bool (optional, default=True)
        If True, remove non-finite values (nan and inf) before computing
        the index and return 0.0 if no values remain.  If False, the
        values are used as given, so any nan makes the result nan.

    Returns
    -------
    eui : float
        Euclidean index of the author for the given citations.

    Examples
    --------
    >>> from scholarmetrics import euclidean
    >>> citations = [6, 10, 5, 46, 0, 2]
    >>> euclidean(citations)
    47.75981574503821

    Notes
    -----
    The Euclidean index was originally proposed by
    Motty Perry and Philip J. Reny [eu]_.

    References
    ----------
    .. [eu] Perry, M. and P. J. Reny (2016):
            "How to Count Citations If You Must",
            *The American Economic Review*, 106(9), pp. 2722-2741.
            DOI: 10.1257/aer.20140850
    """
    arr = _to_array(arr, ignore_nan)
    # np.linalg.norm returns a NumPy scalar; convert it so that callers get
    # the annotated builtin float and the doctest output does not depend on
    # the NumPy scalar repr.
    return float(np.linalg.norm(arr))




# [docs]
def gindex(arr: npt.ArrayLike) -> int:
    """
    Calculate g-index for an author.

    A g-index of x means that the author's top x publications
    together accumulated at least :math:`x^2` citations.

    Parameters
    ----------
    arr : array-like
        Array of citations.

    Returns
    -------
    gi : int
        g-index of the author for the given citations.

    Examples
    --------
    >>> from scholarmetrics import gindex
    >>> citations = [6, 10, 5, 46, 0, 2]
    >>> gindex(citations)
    5

    Notes
    -----
    The g-index was originally proposed by Leo Egghe [g]_.  It excludes
    uncited publications, so the result can never exceed the number of
    cited publications.  nan and infinite values are silently discarded,
    just like zero values.

    References
    ----------
    .. [g] Egghe, L. (2006): "Theory and practise of the g-index",
           *Scientometrics*, 69(1), pp. 131–152.
           DOI: 10.1007/s11192-006-0144-7
    """
    arr = _to_array(arr, ignore_nan=True)
    cited = arr[np.nonzero(arr)]
    # Running citation total of the top-k publications for k = 1..n.
    cum_citations = np.cumsum(np.sort(cited)[::-1])
    ranks = np.arange(1, cited.size + 1)
    # Count the ranks k whose top-k total reaches k**2.  With citations
    # sorted in descending order these ranks form a prefix 1..g, so the
    # count equals the largest such rank.  int() yields the annotated
    # builtin type rather than a NumPy scalar.
    return int(np.count_nonzero(cum_citations >= ranks**2))




# [docs]
def hindex(arr: npt.ArrayLike, ignore_nan: bool = True) -> int | float:
    """
    Calculate h-index for an author.

    An h-index of x means that the author has at least x publications
    that have been cited at least x times.

    Parameters
    ----------
    arr : array-like
        Array of citations.

    ignore_nan : bool (optional, default=True)
        nan (and infinite) values are always left out of the count.
        If True, return 0 when no values remain (all values are nan or
        the input is empty); if False, return nan in that case.

    Returns
    -------
    hi : int or float
        H-index of the author for the given citations.  The value is an
        int, except for the nan (a float) returned when `ignore_nan` is
        False and no finite values remain.

    Examples
    --------
    >>> from scholarmetrics import hindex
    >>> citations = [6, 10, 5, 46, 0, 2]
    >>> hindex(citations)
    4

    Notes
    -----
    The h-index was originally proposed by Jorge E. Hirsch [h]_.

    References
    ----------
    .. [h] Hirsch, J. E. (2005): "An index to quantify
           an individual's scientific research output",
           *Proceedings of the National Academy of Sciences of the USA*
           102(46).
           DOI: 10.1073/pnas.0507655102
    """
    arr = _to_array(arr, ignore_nan=True)  # remove nan in any case
    if not ignore_nan and len(arr) == 0:  # return nan if all values are nan
        return np.nan
    ranked = np.sort(arr)[::-1]  # citations in descending order
    ranks = np.arange(1, ranked.size + 1)
    # Count the publications whose 1-based rank does not exceed their
    # citation count.  int() yields the annotated builtin type rather than
    # a NumPy scalar.
    return int(np.count_nonzero(ranks <= ranked))



def _to_array(arr: npt.ArrayLike, ignore_nan: bool) -> np.ndarray:
    """Convert an array-like object into a numpy array.

    When `ignore_nan` is True, only the finite entries are kept (nan and
    infinite values are dropped), and the result of that selection is
    one-dimensional.  Otherwise the converted array is returned as is.
    """
    values = np.array(arr)
    if not ignore_nan:
        return values
    finite_mask = np.isfinite(values)
    return values[finite_mask]