Source code for publicdata.census.dataframe

# Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
# MIT License, included in this distribution as LICENSE
"""

"""

import six
import pandas as pd
import numpy as np

[docs]class CensusDataFrame(pd.DataFrame): _metadata = ['title_map', 'release', '_dataframe', '_url', 'table'] # Release is the Census Reporter release metadata def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False, schema=None, table=None, url=None): if columns is None and schema is not None: self.title_map = {s['code']: s['code_title'] for s in schema} columns = list( e.lower() for e in self.title_map.keys()) else: self.title_map = {} self._url = url self.table = table super(CensusDataFrame, self).__init__(data, index, columns, dtype, copy) for c in self.columns: self[c].title = self.title_map.get(c, c) @property def _constructor(self): return CensusDataFrame @property def _constructor_sliced(self): from .series import CensusSeries return CensusSeries @property def titles(self): """Return a copy that uses titles for column headings""" # There is a bug elsewhere that sometimes the columns are uppercase, and sometimes # they are lowercase. m = dict( list(self.title_map.items()) + [ (k.lower(), v) for k,v in self.title_map.items()]) return self.rename(index=str, columns=m, inplace=False) @property def mi(self): """Return a copy with a multiindex for the columns, with levels for table name, margin/estimate, column number, and race iteration""" return self
[docs] def search_columns(self, *args): """Return full titles for columns that contain one of the strings in the arguments :param args: String arguments, or compiled regular expressions """ import re def _f(): for a in args: for k, v in self.title_map.items(): if a.search(v) if isinstance(a, re._pattern_type) else a in str(v): yield (k, v) return pd.DataFrame(data=list(_f()), columns='code title'.split())
def _col_name_match(self, c, key): return (key == str(c['name']) or str(key).lower() == str(c['name']).lower() or key == c['code'] or key == str(c['code']).lower() or key == c['title'].lower() or key == str(c['index']).lower() or key == str(c['position'])) def _default_schema_entry(self, pos, c): """ Return a schema entry for columns that aren't ACS table columns :param pos: Position of the column :param c: Column name :return: """ return { 'name': c, 'title': c, 'code': c, 'code_title': c, 'indent': 0, 'index': None, 'position': pos } @property def rows(self): """Yield rows like a partition does, with a header first, then rows. """ yield [self.index.name] + list(self.columns) for t in self.itertuples(): yield list(t) @property def geoframe(self): """Return a geopandas dataframe with boundaries for the area""" return self._url.geoframe
[docs] def sum_m(self, *cols, inplace=False): """Sum a set of Dataframe series and return the summed series and margin. The series must have names""" # See the ACS General Handbook, Appendix A, "Calculating Margins of Error for Derived Estimates". # (https://www.census.gov/content/dam/Census/library/publications/2008/acs/ACSGeneralHandbook.pdf) # for a guide to these calculations. if len(cols) == 1 and isinstance(cols[0], (list, tuple)): cols = cols[0] cols = [self[c] for c in cols] estimates = sum(cols) margins = np.sqrt(sum(c.m90 ** 2 for c in cols)) return estimates, margins
[docs] def add_sum_m(self, col_name, *cols): """ Add new columns for the sum, plus error margins, for 2 or more other columns The routine will add two new columns, one named for col_name, and one for <col_name>_m90 :param col_name: The base name of the new column :param cols: :return: """ self[col_name], self[col_name + '_m90'] = self.sum_m(*cols) return self
[docs] def add_rse(self, *col_name): """ Create a new column, <col_name>_rse for Relative Standard Error, using <col_name> and <col_name>_m90 :param col_name: :return: """ for cn in col_name: self[cn + '_rse'] = self[cn].rse return self
[docs] def sum_col_range(self, first, last): """Sum a contiguous group of columns, and return the sum and the new margins. """ c1 = self[first] c2 = self[last] cols = self.ix[:, c1.col_position:c2.col_position + 1] estimates = sum(cols) margins = np.sqrt(np.sum(c.m90 ** 2 for c in cols)) return estimates, margins
[docs] def ratio(self, n, d): """ Calculate a ratio. The numerator should not be a subset of the denominator, such as the ratio of males to females. If it is a subset, use proportion(). :param n: The Numerator, a string, CensusSeries or tuple :param d: The Denominator, a string, CensusSeries or tuple :return: a tuple of series, the estimates and the margins """ return self._ratio(n, d, subset=False)
[docs] def proportion(self, n, d): """ Calculate a proportion. The numerator should be a subset of the denominator, such as the proportion of females to the total population. If it is not a subset, use ratio(). ( I think "subset" mostly means that the numerator < denominator ) :param n: The Numerator, a string, CensusSeries or tuple :param d: The Denominator, a string, CensusSeries or tuple :return: a tuple of series, the estimates and the margins """ return self._ratio(n, d, subset=True)
[docs] def normalize(self, x): """Convert any of the numerator and denominator forms into a consistent tuple form""" from .series import CensusSeries if isinstance(x, tuple): return self[x[0]], self[x[1]] elif isinstance(x, six.string_types): return self[x], self[x].m90 elif isinstance(x, CensusSeries): return x.value, x.m90 else: raise ValueError("Don't know what to do with a {}".format(type(x)))
def _ratio(self, n, d, subset=True): """ Compute a ratio of a numerator and denominator, propagating errors Both arguments may be one of: * A CensusSeries for the estimate * a string that can be resolved to a colum with .lookup() * A tuple of names that resolve with .lookup() In the tuple form, the first entry is the estimate and the second is the 90% margin :param n: The Numerator, a string, CensusSeries or tuple :param d: The Denominator, a string, CensusSeries or tuple :return: a tuple of series, the estimates and the margins """ n, n_m90 = self.normalize(n) d, d_m90 = self.normalize(d) rate = n.astype(float) / d.astype(float) if subset: try: # From external_documentation.acs_handbook, Appendix A, "Calculating MOEs for # Derived Proportions". This is for the case when the numerator is a subset of the # denominator # In the case of a neg arg to a square root, the acs_handbook recommends using the # method for "Calculating MOEs for Derived Ratios", where the numerator # is not a subset of the denominator. Since our numerator is a subset, the # handbook says " use the formula for derived ratios in the next section which # will provide a conservative estimate of the MOE." # The handbook says this case should be rare, but for this calculation, it # happens about 50% of the time. # Normal calc, from the handbook sqr = n_m90 ** 2 - ((rate ** 2) * (d_m90 ** 2)) # When the sqr value is <= 0, the sqrt will fail, so use the other calc in those cases sqrn = sqr.where(sqr > 0, n_m90 ** 2 + ((rate ** 2) * (d_m90 ** 2))) # Aw, hell, just punt. sqrnz = sqrn.where(sqrn > 0, float('nan')) rate_m = np.sqrt(sqrnz) / d except ValueError: return self._ratio(n, d, False) else: rate_m = np.sqrt(n_m90 ** 2 + ((rate ** 2) * (d_m90 ** 2))) / d return rate, rate_m
[docs] def product(self, a, b): a, a_m90 = self.normalize(a) b, b_m90 = self.normalize(b) p = a * b margin = np.sqrt(a ** 2 * b_m90 ** 2 + b ** 2 * a_m90 ** 2) return p, margin
[docs] def dim_columns(self, pred): """ Return a list of columns that have a particular value for age, sex and race_eth. The `pred` parameter is a string of python code which is evaled, with the classification dict as the local variable context, so the code string can access these variables: - sex - age - race-eth - col_num Col_num is the number in the last three digits of the column name Some examples of predicate strings: - "sex == 'male' and age != 'na' " :param pred: A string of python code that is executed to find column matches. """ from .dimensions import classify out_cols = [] for i, c in enumerate(self.partition.table.columns): if c.name.endswith('_m90'): continue if i < 9: continue cf = classify(c) cf['col_num'] = int(c.name[-3:]) if eval(pred, {}, cf): out_cols.append(c.name) return out_cols
def __getitem__(self, key): """ """ from pandas import DataFrame, Series from .series import CensusSeries result = super(CensusDataFrame, self).__getitem__(key) if isinstance(result, DataFrame): result.__class__ = CensusDataFrame result._dataframe = self elif isinstance(result, Series): result.__class__ = CensusSeries result._dataframe = self return result
[docs] def copy(self, deep=True): r = super(CensusDataFrame, self).copy(deep) r.__class__ = CensusDataFrame r.title_map = self.title_map return r
[docs] def set_index(self, keys, drop=True, append=False, inplace=False, verify_integrity=False): r = super().set_index(keys, drop, append, inplace, verify_integrity) r = self if inplace else r return r
[docs] def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs): """ Overrides groupby() to return CensusDataFrameGroupBy """ from .groupby import groupby if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) return groupby(self, by=by, axis=axis, level=level, as_index=as_index, sort=sort, group_keys=group_keys, squeeze=squeeze, **kwargs)
[docs] def stacked(self, add_dimensions=False): """Return a plain Dataframe in a stacked format, with a column for the column name and one column for all margins and one for all estimates. Drops the name, county and and stusab because they get replicated once for each column. Filter on them before stacking. """ t = self.drop(columns=['stusab', 'name', 'county']) t = pd.DataFrame(t) t1 = t[[c for c in t.columns if '_m90' not in c]].stack().to_frame() t1.columns = ['estimate'] t2 = t[[c for c in t.columns if '_m90' in c]] t2.columns = [c.replace('_m90', '') for c in t2.columns] t2 = t2.stack().to_frame() t2.columns = ['margin'] t3 = t1.join(t2) t3.index.names = ['geoid', 'column'] if add_dimensions: sex = pd.DataFrame([(c.unique_id.lower(), c.sex) for c in self.table.columns], columns=['column', 'sex']).set_index('column') age = pd.DataFrame([(c.unique_id.lower(), c.age) for c in self.table.columns], columns=['column', 'age']).set_index('column') raceeth = pd.DataFrame([(c.unique_id.lower(), c.raceeth) for c in self.table.columns], columns=['column', 'raceeth']).set_index('column') pov = pd.DataFrame([(c.unique_id.lower(), c.poverty_status) for c in self.table.columns], columns=['column', 'poverty_status']).set_index('column') t4 = t3.join(sex).join(age).join(raceeth).join(pov).reset_index() # Move the margin and estimate columns to the end return t4[list(c for c in t4.columns if c not in ['estimate', 'margin']) + ['estimate', 'margin']] else: return t3
## ## Extension Points ## @property def _constructor(self): return CensusDataFrame @property def _constructor_sliced(self): from .series import CensusSeries return CensusSeries def _getitem_column(self, key): """ Return a column from a name :param key: :return: """ c = super(CensusDataFrame, self)._getitem_column(key) c.parent_frame = self c.title = self.title_map.get(key, key) return c def _getitem_array(self, key): """Return a set of columns. The keys can be any of the names for the column, the method automatically adds _m90 columns""" if isinstance(key, list): # augmented_key is the original list of columns with the _m90 columns added augmented_key = [] for col_name in key: augmented_key.append(col_name) try: self[col_name + '_m90'] augmented_key.append(col_name + '_m90') except KeyError: pass df = super()._getitem_array(augmented_key) else: df = super()._getitem_array(key) assert id(df) != id(self) return df