Source code for publicdata.census.api.censusapi

# Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
# MIT License, included in this distribution as LICENSE
"""
Access the Census API and create Pandas dataframes, with support for IPython and Jupyter display
"""

import json
from collections import UserList, UserDict
from textwrap import fill

import requests
from terminaltables import AsciiTable as TermTable
from publicdata.census.util import nl2br, slugify
from rowgenerators import get_cache
from publicdata.census.censusreporter.exceptions import AccessException

def _cached_get(url, cache=True):
    """Return the results of a GET request, possibly cached.
    Assumes the response is JSON"""

    cache_fs = get_cache()

    cache_key = slugify(url)

    if cache and cache_fs.exists(cache_key):
        data = json.loads(cache_fs.gettext(cache_key))
    else:
        try:
            r = requests.get(url)
            data = r.json()
            r.raise_for_status()
        except:
            raise AccessException("ERROR "+r.text)


        if cache:
            cache_fs.settext(cache_key, json.dumps(data, indent=4))

    return data


[docs]class VariableMeta(UserDict): def __init__(self, dict=None, **kwargs): super().__init__(dict, **kwargs)
[docs]class VariableList(UserList): """List container for dataset results""" def __init__(self, initlist=None): super().__init__(initlist) def _table_data(self): return ['Name Label Concept Type Required'.split()] + \ [[e.get('name'), e.get('label'), e.get('concept'), e.get('predicateType', ''), e.get('required', '')] for e in sorted(self.data, key=lambda e: e.get('name'))] def __str__(self): table = TermTable(self._table_data()) table.inner_row_border = True return table.table def _repr_html_(self): data = self._table_data() def make_row(cells, tag='td'): return "<tr>{}</tr>".format(''.join("<{1}>{0}</{1}>".format(nl2br(str(c)), tag) for c in cells)) return "<table>\n" + make_row(data[0],tag='th') +\ ''.join(make_row(cells) for cells in data[1:]) + "</table>" def _repr_pretty_(self, p, cycle): """Default pretty printer """ if cycle: p.text(self.__class__.__name__ + "(...)") else: p.text(str(self))
[docs]class DatasetMeta(UserDict): """Container for Dataset Metadata and an access API""" def __init__(self, dict=None, **kwargs): super().__init__(dict, **kwargs) @property def access_url(self): for d in self.get('distribution'): if d.get('format') == 'API': return d.get('accessURL') @property def id(self): return self.get('identifier', '').replace('http://api.census.gov/data/id/', '') @property def variables_meta(self): return _cached_get(self.c_variablesLink) @property def variables(self): return VariableList(sorted( [VariableMeta(dict([('name', k)] + list(v.items()))) for k, v in self.variables_meta.get('variables', []).items()], key=lambda x: x.get('name', '').lower() )) def _search_variables(self, *args, **kwargs): import re for variable in sorted(self.variables, key=lambda d: d.get('label')): text = variable.get('name', '') + ' ' + variable.get('label', '') + ' ' + variable.get('concept', ' ') if any(a.search(text.lower()) if isinstance(a, re._pattern_type) else a.lower() in str(text.lower()) for a in args): yield variable continue for k, v in kwargs.items(): if isinstance(v, re._pattern_type) and v.search(variable.get(k, '')): yield variable break elif v in variable.get(k, ''): yield variable break
[docs] def search_variables(self, *args, **kwargs): return VariableList(self._search_variables(*args, **kwargs))
def __getattr__(self, item): return self[item]
[docs] def fetch_url(self, *get, geo_for=None, geo_in=None, **predicates): from six.moves.urllib.parse import urlencode, quote_plus d = dict( get=','.join(quote_plus(e) for e in get) ) if geo_for: d['for'] = geo_for if geo_in: d['in'] = geo_in for k, v in predicates.items(): d[k] = v return self.access_url+"?"+urlencode(d)
[docs] def fetch(self, *get, geo_for=None, geo_in=None, cache=True, **predicates ): url = self.fetch_url(*get, geo_for=geo_for, geo_in=geo_in, **predicates) return _cached_get(url, cache=cache)
[docs] def fetch_dataframe(self, *get, geo_for=None, geo_in=None, cache=True, **predicates): import pandas d = self.fetch(*get, geo_for=geo_for, geo_in=geo_in, cache=cache, **predicates) return pandas.DataFrame(d[1:], columns=d[0])
def _table_data(self): from textwrap import fill return [ ['title', self.title], ['identitfier', self.identifier], ['description', fill(self.description, 75)], ['vintage', self.get('c_vintage')], ['Access Url', self.access_url], ['Geographies', self.get('c_geographyLink','').replace('.json','.html')], ['Variables', self.get('c_variablesLink', '').replace('.json', '.html')], ['Examples', self.get('c_examplesLink', '').replace('.json', '.html')], ] def __str__(self): table = TermTable(self._table_data()) table.inner_row_border = False table.title = "Dataset " + self.id return table.table def _repr_html_(self): """Display routine for IPython""" data = self._table_data() def make_row(cells, tag='td'): return "<tr>{}</tr>".format(''.join("<{1}>{0}</{1}>".format(c, tag) for c in cells)) return "<table>\n" + '\n'.join(make_row(cells) for cells in data) + "</table>" def _repr_pretty_(self, p, cycle): """Default pretty printer """ if cycle: p.text(self.__class__.__name__ + "(...)") else: p.text(str(self))
[docs]class DatasetList(UserList): """List container for dataset results""" def __init__(self, initlist=None): super().__init__(initlist) @property def titles(self): """Return only the titles from the results""" return [e.get('title') for e in self.data] def _table_data(self): return ['Title Description '.split()] + \ [[e.id + '\n' + fill(e.get('title', ''), 18), fill(e.get('description', ''), 60)] for e in self.data] def __str__(self): data = self._table_data() table = TermTable(data) table.inner_row_border = True return table.table def _repr_html_(self): data = self._table_data() def make_row(cells, tag='td'): return "<tr>{}</tr>".format(''.join("<{1}>{0}</{1}>".format(nl2br(str(c)), tag) for c in cells)) return "<table>\n" + make_row(data[0], tag='th') + \ ''.join(make_row(cells) for cells in data[1:]) + "</table>" def _repr_pretty_(self, p, cycle): """Default pretty printer """ if cycle: p.text(self.__class__.__name__ + "(...)") else: p.text(str(self))
[docs]class CensusApi(object): def __init__(self): pass @property def metadata(self): return self._metadata(cache=True) def _metadata(self, cache=True): """Return the API metadata""" url = "https://api.census.gov/data.json" return _cached_get(url, cache=cache) def _datasets(self): for _dataset in sorted(self.metadata.get('dataset', []), key=lambda d: d.get('title')): yield DatasetMeta(_dataset) @property def datasets(self): return DatasetList(self._datasets()) def _search_datasets(self, *args, **kwds): import re for _dataset in sorted(self.metadata.get('dataset', []), key=lambda d: d.get('title')): dataset = DatasetMeta(_dataset) text = dataset.id + ' ' + \ dataset.get('title', '') + ' ' + \ dataset.get('description', ' ') + \ (' '.join(dataset.get('keyword', [])) + ' ' + str(dataset.get('vintage', ' '))) if any(a.search(text.lower()) if isinstance(a, re._pattern_type) else a.lower() in str(text.lower()) for a in args): yield dataset continue for k, v in kwds.items(): if isinstance(v, re._pattern_type) and v.search(dataset.get(k, '')): yield dataset break elif v in dataset.get(k, ''): yield dataset break
[docs] def search_datasets(self, *args, **kwds): return DatasetList(self._search_datasets(*args, **kwds))
[docs] def get_dataset(self, id): """Return a dataset given its id or identifier""" parts = id.split('/') ident = parts[-1] for d in self.metadata.get('dataset', []): if d.get('identifier', '').endswith('/' + ident): return DatasetMeta(d)