Source code for tableone.tableone

"""
The tableone package is used for creating "Table 1" summary statistics for
research papers.
"""
from typing import Optional, Union
import warnings

import numpy as np
import pandas as pd
from tabulate import tabulate

from tableone.deprecations import handle_deprecated_parameters
from tableone.preprocessors import (ensure_list, detect_categorical, order_categorical,
                                    get_groups, handle_categorical_nulls)
from tableone.statistics import Statistics
from tableone.tables import Tables
from tableone.validators import DataValidator, InputValidator


def load_dataset(name: str) -> pd.DataFrame:
    """
    Load an example dataset from the online repository (requires internet).

    These datasets are useful for documentation and testing.

    Parameters
    ----------
    name : str
        Name of the dataset.

    Returns
    -------
    df : :class:`pandas.DataFrame`
        Tabular data.
    """
    path = ("https://raw.githubusercontent.com/"
            "tompollard/tableone/master/datasets/{}.csv")
    full_path = path.format(name)

    df = pd.read_csv(full_path)

    return df


def docstring_copier(*sub):
    """
    Wrap the TableOne docstring (not ideal :/)
    """
    def dec(obj):
        obj.__doc__ = obj.__doc__.format(*sub)
        return obj
    return dec



[docs]
class TableOne:
    """

    If you use the tableone package, please cite:

    Pollard TJ, Johnson AEW, Raffa JD, Mark RG (2018). tableone: An open source
    Python package for producing summary statistics for research papers.
    JAMIA Open, Volume 1, Issue 1, 1 July 2018, Pages 26-31.
    https://doi.org/10.1093/jamiaopen/ooy012

    Create an instance of the tableone summary table.

    Parameters
    ----------
    data : pandas DataFrame
        The dataset to be summarised. Rows are observations, columns are
        variables.
    columns : list, optional
        List of columns in the dataset to be included in the final table.
        Setting the argument to None will include all columns by default.
    categorical : list, optional
        List of columns that contain categorical variables.
        If the argument is set to None (or omitted), we attempt to detect
        categorical variables. Set to an empty list to indicate explicitly
        that there are no variables of this type to be included.
    continuous : list, optional
        List of columns that contain continuous variables.
        If the argument is set to None (or omitted), we attempt to detect
        continuous variables. Set to an empty list to indicate explicitly
        that there are no variables of this type to be included.
    groupby : str, optional
        Optional column for stratifying the final table (default: None).
    nonnormal : list, optional
        List of columns that contain non-normal variables (default: None).
    min_max: list, optional
        List of variables that should report minimum and maximum, instead of
        standard deviation (for normal) or Q1-Q3 (for non-normal).
    pval : bool, optional
        Display computed P-Values (default: False).
    pval_adjust : str, optional
        Method used to adjust P-Values for multiple testing.
        The P-values from the unadjusted table (default when pval=True)
        are adjusted to account for the number of total tests that were
        performed.
        These adjustments would be useful when many variables are being
        screened to assess if their distribution varies by the variable in the
        groupby argument.
        For a complete list of methods, see documentation for statsmodels
        multipletests.
        Available methods include ::

        `None` : no correction applied.
        `bonferroni` : one-step correction
        `sidak` : one-step correction
        `holm-sidak` : step down method using Sidak adjustments
        `simes-hochberg` : step-up method (independent)
        `hommel` : closed method based on Simes tests (non-negative)

    htest_name : bool, optional
        Display a column with the names of hypothesis tests (default: False).
    htest : dict, optional
        Dictionary of custom hypothesis tests. Keys are variable names and
        values are functions. Functions must take a list of Numpy Arrays as
        the input argument and must return a test result.
        e.g. htest = {'age': myfunc}
    missing : bool, optional
        Display a count of null values (default: True).
    ddof : int, optional
        Degrees of freedom for standard deviation calculations (default: 1).
    rename : dict, optional
        Dictionary of alternative names for variables.
        e.g. `rename = {'sex':'gender', 'trt':'treatment'}`
    sort : bool or str, optional
        If `True`, sort the variables alphabetically. If a string
        (e.g. `'P-Value'`), sort by the specified column in ascending order.
        Default (`False`) retains the sequence specified in the `columns`
        argument. Currently the only columns supported are: `'Missing'`,
        `'P-Value'`, `'P-Value (adjusted)'`, and `'Test'`.
    limit : int or dict, optional
        Limit to the top N most frequent categories. If int, apply to all
        categorical variables. If dict, apply to the key (e.g. {'sex': 1}).
    order : dict, optional
        Specify an order for categorical variables. Key is the variable, value
        is a list of values in order.  {e.g. 'sex': ['f', 'm', 'other']}
    label_suffix : bool, optional
        Append summary type (e.g. "mean (SD); median [Q1,Q3], n (%); ") to the
        row label (default: True).
    decimals : int or dict, optional
        Number of decimal places to display. An integer applies the rule to all
        variables (default: 1). A dictionary (e.g. `decimals = {'age': 0)`)
        applies the rule per variable, defaulting to 1 place for unspecified
        variables. For continuous variables, applies to all summary statistics
        (e.g. mean and standard deviation). For categorical variables, applies
        to percentage only.
    overall : bool, optional
        If True, add an "overall" column to the table. Smd and p-value
        calculations are performed only using stratified columns.
    row_percent : bool, optional
        If True, compute "n (%)" percentages for categorical variables across
        "groupby" rows rather than columns.
    display_all : bool, optional
        If True, set pd. display_options to display all columns and rows.
        (default: False)
    dip_test : bool, optional
        Run Hartigan's Dip Test for multimodality. If variables are found to
        have multimodal distributions, a remark will be added below the
        Table 1.
        (default: False)
    normal_test : bool, optional
        Test the null hypothesis that a sample come from a normal distribution.
        Uses scipy.stats.normaltest. If variables are found to have non-normal
        distributions, a remark will be added below the Table 1.
        (default: False)
    tukey_test : bool, optional
        Run Tukey's test for far outliers. If variables are found to
        have far outliers, a remark will be added below the Table 1.
        (default: False)
    include_null : bool, optional
        Include None/Null values for categorical variables by treating them as a
        category level. (default: True)


    Attributes
    ----------
    tableone : dataframe
        Summary of the data (i.e., the "Table 1").

    Examples
    --------
    >>> df = pd.DataFrame({'size': [1, 2, 60, 1, 1],
    ...                   'fruit': ['peach', 'orange', 'peach', 'peach',
    ...                             'orange'],
    ...                   'tasty': ['yes', 'yes', 'no', 'yes', 'no']})

    >>> df
       size   fruit tasty
    0     1   peach   yes
    1     2  orange   yes
    2    60   peach    no
    3     1   peach   yes
    4     1  orange    no

    >>> TableOne(df, overall=False, groupby="fruit", pval=True)

                        Grouped by fruit
                                 Missing     orange        peach P-Value
    n                                             2            3
    size, mean (SD)                    0  1.5 (0.7)  20.7 (34.1)   0.433
    tasty, n (%)    no                 0   1 (50.0)     1 (33.3)   1.000
                    yes                    1 (50.0)     2 (66.7)

    ...
    """
    def __init__(self, data: pd.DataFrame,
                 columns: Optional[list] = None,
                 categorical: Optional[list] = None,
                 continuous: Optional[list] = None,
                 groupby: Optional[str] = None,
                 nonnormal: Optional[list] = None,
                 min_max: Optional[list] = None, pval: Optional[bool] = False,
                 pval_adjust: Optional[str] = None, htest_name: bool = False,
                 pval_test_name: bool = False, htest: Optional[dict] = None,
                 isnull: Optional[bool] = None, missing: bool = True,
                 ddof: int = 1, labels: Optional[dict] = None,
                 rename: Optional[dict] = None, sort: Union[bool, str] = False,
                 limit: Union[int, dict, None] = None,
                 order: Optional[dict] = None, remarks: bool = False,
                 label_suffix: bool = True, decimals: Union[int, dict] = 1,
                 smd: bool = False, overall: bool = True,
                 row_percent: bool = False, display_all: bool = False,
                 dip_test: bool = False, normal_test: bool = False,
                 tukey_test: bool = False,
                 pval_threshold: Optional[float] = None,
                 include_null: Optional[bool] = True) -> None:

        # Warn about deprecated parameters
        handle_deprecated_parameters(labels, isnull, pval_test_name, remarks)

        # Attach submodules
        self.statistics = Statistics()
        self.tables = Tables()

        # Initialize attributes
        data = self.initialize_core_attributes(data, columns, categorical, continuous, groupby,
                                               nonnormal, min_max, pval, pval_adjust, htest_name,
                                               htest, missing, ddof, rename, sort, limit, order,
                                               label_suffix, decimals, smd, overall, row_percent,
                                               dip_test, normal_test, tukey_test, pval_threshold,
                                               include_null)

        # Initialize intermediate tables
        self.initialize_intermediate_tables()

        # Set up validators and validate data
        self.setup_validators()
        self.validate_data(data)

        # Create all intermediate tables
        self.create_intermediate_tables(data)

        # Assemble Table 1
        self.tableone = self._create_tableone(data)

        # wrap dataframe methods
        self.head = self.tableone.head
        self.tail = self.tableone.tail
        self.to_csv = self.tableone.to_csv
        self.to_excel = self.tableone.to_excel
        self.to_html = self.tableone.to_html
        self.to_json = self.tableone.to_json
        self.to_latex = self.tableone.to_latex

        # set display options
        if display_all:
            self._set_display_options()

    def __str__(self) -> str:
        return self.tableone.to_string() + self._generate_remarks('\n')

    def __repr__(self) -> str:
        return self.tableone.to_string() + self._generate_remarks('\n')

    def _repr_html_(self) -> str:
        return self.tableone._repr_html_() + self._generate_remarks('<br />')

    def initialize_core_attributes(self, data, columns, categorical, continuous, groupby,
                                   nonnormal, min_max, pval, pval_adjust, htest_name,
                                   htest, missing, ddof, rename, sort, limit, order,
                                   label_suffix, decimals, smd, overall, row_percent, 
                                   dip_test, normal_test, tukey_test, pval_threshold,
                                   include_null):
        """
        Initialize attributes.
        """
        self._alt_labels = rename
        self._include_null = include_null
        self._columns = columns if columns else data.columns.to_list()  # type: ignore
        self._categorical = detect_categorical(data[self._columns], groupby) if categorical is None else categorical
        if continuous:
            self._continuous = continuous
        else:
            self._continuous = [c for c in self._columns if c not in self._categorical + [groupby]]  # type: ignore
        self._ddof = ddof
        self._decimals = decimals
        self._dip_test = dip_test
        self._groupby = groupby
        self._htest = htest
        self._isnull = missing
        self._label_suffix = label_suffix
        self._limit = limit
        self._min_max = min_max
        self._nonnormal = ensure_list(nonnormal, arg_name="nonnormal")  # type: ignore
        self._normal_test = normal_test
        self._order = order_categorical(data, order)
        self._overall = overall
        self._pval = pval
        self._pval_adjust = pval_adjust
        self._pval_test_name = htest_name
        self._pval_threshold = pval_threshold
        self._reserved_columns = ['Missing', 'P-Value', 'Test', 'P-Value (adjusted)', 'SMD', 'Overall']
        self._row_percent = row_percent
        self._smd = smd
        self._sort = sort
        self._tukey_test = tukey_test
        self._warnings = {}

        if self._categorical and self._include_null:
            data[self._categorical] = handle_categorical_nulls(data[self._categorical])

        self._groupbylvls = get_groups(data, self._groupby, self._order, self._reserved_columns)

        return data

    def initialize_intermediate_tables(self):
        """
        Initialize the intermediate tables.
        """
        # Intermediate tables
        self.htest_table = None
        self.cat_describe_all = None
        self.cont_describe_all = None
        self.cat_describe = None
        self.cont_describe = None
        self.smd_table = None
        self.cat_table = None
        self.cont_table = None

    def setup_validators(self):
        self.data_validator = DataValidator()
        self.input_validator = InputValidator()

    def validate_data(self, data):
        self.input_validator.validate(self._groupby, self._nonnormal, self._min_max,  # type: ignore
                                      self._pval_adjust, self._order, self._pval,  # type: ignore
                                      self._columns, self._categorical, self._continuous)  # type: ignore
        self.data_validator.validate(data, self._columns, self._categorical, self._include_null)  # type: ignore

    def create_intermediate_tables(self, data):
        """
        Creates all intermediate tables.
        """
        # forgive me jraffa
        if self._pval:
            self.htest_table = self.tables.create_htest_table(data, self._continuous, self._categorical,
                                                              self._nonnormal, self._groupby,
                                                              self._groupbylvls, self._htest,
                                                              self._pval, self._pval_adjust)

        # create overall tables if required
        if self._categorical and self._groupby and self._overall:
            self.cat_describe_all = self.tables.create_cat_describe(data,
                                                                    self._categorical,
                                                                    self._decimals,
                                                                    self._row_percent,
                                                                    self._include_null,
                                                                    groupby=None,
                                                                    groupbylvls=['Overall'])

        if self._continuous and self._groupby and self._overall:
            self.cont_describe_all = self.tables.create_cont_describe(data,
                                                                      self._ddof,
                                                                      self._t1_summary,
                                                                      self._dip_test,
                                                                      self._tukey_test,
                                                                      self._normal_test,
                                                                      self._continuous,
                                                                      groupby=None)

        # create descriptive tables
        if self._categorical:
            self.cat_describe = self.tables.create_cat_describe(data,
                                                                self._categorical,
                                                                self._decimals,
                                                                self._row_percent,
                                                                self._include_null,
                                                                groupby=self._groupby,
                                                                groupbylvls=self._groupbylvls)

        if self._continuous:
            self.cont_describe = self.tables.create_cont_describe(data,
                                                                  self._ddof,
                                                                  self._t1_summary,
                                                                  self._dip_test,
                                                                  self._tukey_test,
                                                                  self._normal_test,
                                                                  self._continuous,
                                                                  groupby=self._groupby)

        # compute standardized mean differences
        if self._smd:
            self.smd_table = self.tables.create_smd_table(self._groupbylvls,
                                                          self._continuous,
                                                          self._categorical,
                                                          self.cont_describe,
                                                          self.cat_describe)

        # create continuous and categorical tables
        if self._categorical:
            self.cat_table = self.tables.create_cat_table(data,
                                                          self._overall,
                                                          self.cat_describe,
                                                          self._categorical,
                                                          self._include_null,
                                                          self._pval,
                                                          self._pval_adjust,
                                                          self.htest_table,
                                                          self._smd,
                                                          self.smd_table,
                                                          self._groupby,
                                                          self.cat_describe_all)

        if self._continuous:
            self.cont_table = self.tables.create_cont_table(data,
                                                            self._overall,
                                                            self.cont_describe,
                                                            self.cont_describe_all,
                                                            self._continuous,
                                                            self._pval,
                                                            self._pval_adjust,
                                                            self.htest_table,
                                                            self._smd,
                                                            self.smd_table,
                                                            self._groupby)

    def _set_display_options(self):
        """
        Set pandas display options. Display all rows and columns by default.
        """
        display_options = {'display.max_rows': None,
                           'display.max_columns': None,
                           'display.width': None,
                           'display.max_colwidth': None}

        for k in display_options:
            try:
                pd.set_option(k, display_options[k])
            except ValueError:
                msg = """Newer version of Pandas required to set the '{}'
                         option.""".format(k)
                warnings.warn(msg)

    def tabulate(self, headers=None, tablefmt='grid', **kwargs) -> str:
        """
        Pretty-print tableone data. Wrapper for the Python 'tabulate' library.

        Args:
            headers (list): Defines a list of column headers to be used.
            tablefmt (str): Defines how the table is formatted. Table formats
                include: 'plain','simple','github','grid','fancy_grid','pipe',
                'orgtbl','jira','presto','psql','rst','mediawiki','moinmoin',
                'youtrack','html','latex','latex_raw','latex_booktabs',
                and 'textile'.

        Examples:
            To output tableone in github syntax, call tabulate with the
                'tablefmt="github"' argument.

            >>> print(tableone.tabulate(tablefmt='fancy_grid'))
        """
        # reformat table for tabulate
        df = self.tableone

        if not headers:
            try:
                headers = df.columns.levels[1]
            except AttributeError:
                headers = df.columns

        df = df.reset_index()
        df = df.set_index('level_0')
        isdupe = df.index.duplicated()
        df.index = df.index.where(~isdupe, '')
        df = df.rename_axis(None).rename(columns={'level_1': ''})

        return tabulate(df, headers=headers, tablefmt=tablefmt, **kwargs)

    def _generate_remarks(self, newline='\n') -> str:
        """
        Generate a series of remarks that the user should consider
        when interpreting the summary statistics.
        """
        if self.cont_describe is not None:
            # generate warnings for continuous variables
            if self._continuous and self._tukey_test:
                # highlight far outliers
                outlier_mask = self.cont_describe.far_outliers > 1
                outlier_vars = list(self.cont_describe.far_outliers[outlier_mask].
                                    dropna(how='all').index)
                if outlier_vars:
                    self._warnings["""Tukey test indicates far outliers
                                    in"""] = outlier_vars

            if self._continuous and self._dip_test:
                # highlight possible multimodal distributions using hartigan's dip
                # test -1 values indicate NaN
                modal_mask = ((self.cont_describe.hartigan_dip >= 0) &
                            (self.cont_describe.hartigan_dip <= 0.05))
                modal_vars = list(self.cont_describe.hartigan_dip[modal_mask].
                                dropna(how='all').index)
                if modal_vars:
                    self._warnings["""Hartigan's Dip Test reports possible
                                    multimodal distributions for"""] = modal_vars

            if self._continuous and self._normal_test:
                # highlight non normal distributions
                # -1 values indicate NaN
                modal_mask = ((self.cont_describe.normality >= 0) &
                            (self.cont_describe.normality <= 0.001))
                modal_vars = list(self.cont_describe.normality[modal_mask].
                                dropna(how='all').index)
                if modal_vars:
                    self._warnings["""Normality test reports non-normal
                                    distributions for"""] = modal_vars

            # create the warning string
            msg = '{}'.format(newline)
            for n, k in enumerate(sorted(self._warnings)):
                msg += '[{}] {}: {}.{}'.format(n+1, k, ', '.join(self._warnings[k]), newline)
        else:
            msg = ""

        return msg

    def _t1_summary(self, x: pd.Series) -> str:
        """
        Compute median [IQR] or mean (Std) for the input series.

        Parameters
        ----------
            x : pandas Series
                Series of values to be summarised.
        """
        # set decimal places
        if isinstance(self._decimals, int):
            n = self._decimals
        elif isinstance(self._decimals, dict):
            try:
                n = self._decimals[x.name]
            except KeyError:
                n = 1
        else:
            n = 1
            msg = """The decimals arg must be an int or dict.
                     Defaulting to {} d.p.""".format(n)
            warnings.warn(msg)

        if x.name in self._nonnormal:
            f = "{{:.{}f}} [{{:.{}f}},{{:.{}f}}]".format(n, n, n)
            if self._min_max and x.name in self._min_max:
                return f.format(
                    np.nanmedian(x.values), np.nanmin(x.values),  # type: ignore
                    np.nanmax(x.values),  # type: ignore
                )
            else:
                return f.format(
                    np.nanmedian(x.values),  # type: ignore
                    np.nanpercentile(x.values, 25),  # type: ignore
                    np.nanpercentile(x.values, 75),  # type: ignore
                )
        else:
            if self._min_max and x.name in self._min_max:
                f = "{{:.{}f}} [{{:.{}f}},{{:.{}f}}]".format(n, n, n)
                return f.format(
                    np.nanmean(x.values), np.nanmin(x.values),  # type: ignore
                    np.nanmax(x.values),  # type: ignore
                )
            else:
                f = '{{:.{}f}} ({{:.{}f}})'.format(n, n)
                return f.format(np.nanmean(x.values), self.statistics._std(x, self._ddof))  # type: ignore

    def _create_tableone(self, data):
        """
        Create table 1 by combining the continuous and categorical tables.

        Returns
        ----------
        table : pandas DataFrame
            The complete table one.
        """
        if self._continuous and self._categorical:
            # support pandas<=0.22
            try:
                table = pd.concat([self.cont_table, self.cat_table],
                                  sort=False)
            except TypeError:
                table = pd.concat([self.cont_table, self.cat_table])
        elif self._continuous:
            table = self.cont_table
        elif self._categorical:
            table = self.cat_table

        # ensure column headers are strings before reindexing
        table = table.reset_index().set_index(['variable', 'value'])  # type: ignore
        table.columns = table.columns.values.astype(str)

        # sort the table rows
        sort_columns = ['Missing', 'P-Value', 'P-Value (adjusted)', 'Test']
        if self._smd and self.smd_table is not None:
            sort_columns = sort_columns + list(self.smd_table.columns)

        if self._sort and isinstance(self._sort, bool):
            new_index = sorted(table.index.values, key=lambda x: x[0].lower())
        elif self._sort and isinstance(self._sort, str) and (self._sort in
                                                             sort_columns):
            try:
                new_index = table.sort_values(self._sort).index
            except KeyError:
                new_index = sorted(table.index.values,
                                   key=lambda x: self._columns.index(x[0]))
                warnings.warn('Sort variable not found: {}'.format(self._sort))
        elif self._sort and isinstance(self._sort, str) and (self._sort not in
                                                             sort_columns):
            new_index = sorted(table.index.values,
                               key=lambda x: self._columns.index(x[0]))
            warnings.warn('Sort must be in the following ' +
                          'list: {}.'.format(self._sort))
        else:
            # sort by the columns argument
            new_index = sorted(table.index.values,
                               key=lambda x: self._columns.index(x[0]))
        table = table.reindex(new_index)

        # round pval column and convert to string
        if self._pval and self._pval_adjust:
            if self._pval_threshold:
                asterisk_mask = table['P-Value (adjusted)'] < self._pval_threshold

            table['P-Value (adjusted)'] = table['P-Value (adjusted)'].apply(
                                                '{:.3f}'.format).astype(str)
            table.loc[table['P-Value (adjusted)'] == '0.000',
                      'P-Value (adjusted)'] = '<0.001'

            if self._pval_threshold:
                table.loc[asterisk_mask, 'P-Value (adjusted)'] = table['P-Value (adjusted)'][asterisk_mask].astype(str)+"*"  # type: ignore

        elif self._pval:
            if self._pval_threshold:
                asterisk_mask = table['P-Value'] < self._pval_threshold

            table['P-Value'] = table['P-Value'].apply(
                                     '{:.3f}'.format).astype(str)
            table.loc[table['P-Value'] == '0.000', 'P-Value'] = '<0.001'

            if self._pval_threshold:
                table.loc[asterisk_mask, 'P-Value'] = table['P-Value'][asterisk_mask].astype(str)+"*"  # type: ignore

        # round smd columns and convert to string
        if self._smd and self.smd_table is not None:
            for c in list(self.smd_table.columns):
                table[c] = table[c].apply('{:.3f}'.format).astype(str)
                table.loc[table[c] == '0.000', c] = '<0.001'

        # if an order is specified, apply it
        if self._order:
            for k in self._order:

                # Skip if the variable isn't present
                try:
                    all_var = table.loc[k].index.unique(level='value')
                except KeyError:
                    if k not in self._groupby:  # type: ignore
                        warnings.warn("Order variable not found: {}".format(k))
                    continue

                # Remove value from order if it is not present
                if [i for i in self._order[k] if i not in all_var]:
                    rm_var = [i for i in self._order[k] if i not in all_var]
                    self._order[k] = [i for i in self._order[k]
                                      if i in all_var]
                    warnings.warn(("Order value not found: "
                                   "{}: {}").format(k, rm_var))

                new_seq = [(k, '{}'.format(v)) for v in self._order[k]]
                new_seq += [(k, '{}'.format(v)) for v in all_var
                            if v not in self._order[k]]

                # restructure to match the original idx
                new_idx_array = np.empty((len(new_seq),), dtype=object)
                new_idx_array[:] = [tuple(i) for i in new_seq]
                orig_idx = table.index.values.copy()
                orig_idx[table.index.get_loc(k)] = new_idx_array
                table = table.reindex(orig_idx)

        # set the limit on the number of categorical variables
        if self._limit:
            levelcounts = data[self._categorical].nunique()

            for k, _ in levelcounts.items():
                # set the limit for the variable
                if (isinstance(self._limit, int)
                        and levelcounts[k] >= self._limit):
                    limit = self._limit
                elif isinstance(self._limit, dict) and k in self._limit:
                    limit = self._limit[k]
                else:
                    continue

                if not self._order or (self._order and k not in self._order):
                    # re-order the variables by frequency
                    count = data[k].value_counts().sort_values(ascending=False)
                    new_idx = [(k, '{}'.format(i)) for i in count.index]
                else:
                    # apply order
                    all_var = table.loc[k].index.unique(level='value')
                    new_idx = [(k, '{}'.format(v)) for v in self._order[k]]
                    new_idx += [(k, '{}'.format(v)) for v in all_var
                                if v not in self._order[k]]

                # restructure to match the original idx
                new_idx_array = np.empty((len(new_idx),), dtype=object)
                new_idx_array[:] = [tuple(i) for i in new_idx]
                orig_idx = table.index.values.copy()
                orig_idx[table.index.get_loc(k)] = new_idx_array
                table = table.reindex(orig_idx)

                # drop the rows > the limit
                table = table.drop(new_idx_array[limit:])  # type: ignore

        # insert n row
        n_row = pd.DataFrame(columns=['variable', 'value', 'Missing'])
        n_row = n_row.set_index(['variable', 'value'])
        n_row.loc['n', 'Missing'] = None

        # support pandas<=0.22
        try:
            table = pd.concat([n_row, table], sort=False)
        except TypeError:
            table = pd.concat([n_row, table])

        if self._groupbylvls == ['Overall']:
            table.loc['n', 'Overall'] = len(data.index)
        else:
            if self._overall:
                table.loc['n', 'Overall'] = len(data.index)
            for g in self._groupbylvls:
                ct = data[self._groupby][data[self._groupby] == g].count()
                table.loc['n', '{}'.format(g)] = ct

        # only display data in first level row
        dupe_mask = table.groupby(level=[0]).cumcount().ne(0)  # type: ignore
        dupe_columns = ['Missing']
        optional_columns = ['P-Value', 'P-Value (adjusted)', 'Test']
        if self._smd and self.smd_table is not None:
            optional_columns = optional_columns + list(self.smd_table.columns)
        for col in optional_columns:
            if col in table.columns.values:
                dupe_columns.append(col)

        table[dupe_columns] = table[dupe_columns].mask(dupe_mask).fillna('')

        # remove Missing column if not needed
        if not self._isnull:
            table = table.drop('Missing', axis=1)

        if self._pval and not self._pval_test_name:
            table = table.drop('Test', axis=1)

        # replace nans with empty strings
        table = table.fillna('')

        # add column index
        if not self._groupbylvls == ['Overall']:
            # rename groupby variable if requested
            c = self._groupby
            if self._alt_labels:
                if self._groupby in self._alt_labels:
                    c = self._alt_labels[self._groupby]

            c = 'Grouped by {}'.format(c)
            table.columns = pd.MultiIndex.from_product([[c], table.columns])

        # display alternative labels if assigned
        table = table.rename(index=self._create_row_labels(), level=0)

        # ensure the order of columns is consistent
        if self._groupby and self._order and (self._groupby in self._order):
            header = ['{}'.format(v) for v in table.columns.levels[1].values]  # type: ignore
            cols = self._order[self._groupby] + ['{}'.format(v)
                                                 for v in header
                                                 if v not in
                                                 self._order[self._groupby]]
        elif self._groupby:
            cols = ['{}'.format(v) for v in table.columns.levels[1].values]  # type: ignore
        else:
            cols = ['{}'.format(v) for v in table.columns.values]

        if self._groupby and self._overall:
            cols = ['Overall'] + [x for x in cols if x != 'Overall']

        if 'Missing' in cols:
            cols = ['Missing'] + [x for x in cols if x != 'Missing']

        # move optional_columns to the end of the dataframe
        for col in optional_columns:
            if col in cols:
                cols = [x for x in cols if x != col] + [col]

        if self._groupby:
            table = table.reindex(cols, axis=1, level=1)
        else:
            table = table.reindex(cols, axis=1)

        try:
            if 'Missing' in self._alt_labels or 'Overall' in self._alt_labels:  # type: ignore
                table = table.rename(columns=self._alt_labels)
        except TypeError:
            pass

        # remove the 'variable, value' column names in the index
        table = table.rename_axis([None, None])

        return table

    def _create_row_labels(self) -> dict:
        """
        Take the original labels for rows. Rename if alternative labels are
        provided. Append label suffix if label_suffix is True.

        Returns
        ----------
        labels : dictionary
            Dictionary, keys are original column name, values are final label.

        """
        # start with the original column names
        labels = {}
        for c in self._columns:
            labels[c] = c

        # replace column names with alternative names if provided
        if self._alt_labels:
            for k in self._alt_labels.keys():
                labels[k] = self._alt_labels[k]

        # append the label suffix
        if self._label_suffix:
            for k in labels.keys():
                if k in self._nonnormal:
                    if self._min_max and k in self._min_max:
                        labels[k] = "{}, {}".format(labels[k],
                                                    "median [min,max]")
                    else:
                        labels[k] = "{}, {}".format(labels[k],
                                                    "median [Q1,Q3]")
                elif k in self._categorical:
                    labels[k] = "{}, {}".format(labels[k], "n (%)")
                else:
                    if self._min_max and k in self._min_max:
                        labels[k] = "{}, {}".format(labels[k],
                                                    "mean [min,max]")
                    else:
                        labels[k] = "{}, {}".format(labels[k],
                                                    "mean (SD)")

        return labels



# Allow TableOne to be called as a function.
# Refactor this out at some point!
@docstring_copier(TableOne.__doc__)
def tableone(*args, **kwargs):
    """{0}"""
    return TableOne(*args, **kwargs)