Source code for statsmodels.datasets.cancer.data

"""Breast Cancer Data"""

__docformat__ = 'restructuredtext'

COPYRIGHT   = """???"""
TITLE       = """Breast Cancer Data"""
SOURCE      = """
This is the breast cancer data used in Owen's empirical likelihood.  It is taken from
Rice, J.A. Mathematical Statistics and Data Analysis.
http://www.cengage.com/statistics/discipline_content/dataLibrary.html
"""

DESCRSHORT  = """Breast Cancer and county population"""

DESCRLONG   = """The number of breast cancer observances in various counties"""

#suggested notes
NOTE        = """::

    Number of observations: 301
    Number of variables: 2
    Variable name definitions:

        cancer - The number of breast cancer observances
        population - The population of the county

"""

import numpy as np
from statsmodels.datasets import utils as du
from os.path import dirname, abspath

[docs]def load():
    """
    Load the data and return a Dataset class instance.

    Returns
    -------
    Dataset instance:
        See DATASET_PROPOSAL.txt for more information.
    """
    data = _get_data()
    ##### SET THE INDICES #####
    #NOTE: None for exog_idx is the complement of endog_idx
    return du.process_recarray(data, endog_idx=0, exog_idx=None, dtype=float)

[docs]def load_pandas():
    data = _get_data()
    ##### SET THE INDICES #####
    #NOTE: None for exog_idx is the complement of endog_idx
    return du.process_recarray_pandas(data, endog_idx=0, exog_idx=None,
                                      dtype=float)

def _get_data():
    filepath = dirname(abspath(__file__))
    ##### EDIT THE FOLLOWING TO POINT TO DatasetName.csv #####
    data = np.recfromtxt(open(filepath + '/cancer.csv', 'rb'),
            delimiter=",", names = True, dtype=float)
    return data