gdutils.dataqa

dataqa is a module in package gdutils that provides functions used in data quality assurance.


Examples Setup

[1]:
# Install ``gdutils`` package
!conda install fiona shapely pyproj rtree && pip3 install wheel
!pip3 install git+https://github.com/mggg/gdutils.git > /dev/null
[2]:
import gdutils.dataqa as dq # imports the ``dataqa`` module

import geopandas as gpd
import pandas as pd
import json

import gdutils.datamine as dm
import gdutils.extract as et
[3]:
# Gather example datasets
dm.clone_gh_repos('mggg-states', 'orgs', ['AK-shapefiles'])
dm.clone_gh_repos('MEDSL', 'users', ['official-precinct-returns'])
    # ^ this is a very large dataset -- will take some time to clone
[4]:
# List available datasets to use
[5]:
dm.list_files_of_type('.zip', 'AK-shapefiles/')
[5]:
['AK-shapefiles/AK_precincts.zip']
[ ]:
dm.list_files_of_type('.zip', 'official-precinct-returns/')
    # output not displayed since it takes up too much screen space
[7]:
# Extract applicable data subsets
[8]:
mggg_gdf = et.read_file('AK-shapefiles/AK_precincts.zip', column='NAME').extract()
[ ]:
medsl_et = et.read_file('official-precinct-returns/2016-precinct-president/2016-precinct-president.zip',
                        column='state', value='Alaska')
medsl_gdf = medsl_et.extract()
[10]:
# Data-wrangle MEDSL data
medsl_pvt = medsl_gdf.pivot_table(index='precinct',
                                  columns=['office', 'party'],
                                  values='votes')
medsl_pvt.columns = [' '.join(col).strip() for col in medsl_pvt.columns.values]
medsl_gdf = et.ExtractTable(medsl_pvt).extract()
[11]:
# Load and generate naming convention standards
with open('naming_convention.json') as json_file:
    standards_raw = json.load(json_file)

offices = dm.get_keys_by_category(standards_raw, 'offices')
parties = dm.get_keys_by_category(standards_raw, 'parties')
counts = dm.get_keys_by_category(standards_raw, 'counts')
others = dm.get_keys_by_category(standards_raw,
            ['geographies', 'demographics', 'districts', 'other'])

elections = [office + format(year, '02') + party
                 for office in offices
                 for year in range(0, 21)
                 for party in parties
                 if not (office == 'PRES' and year % 4 != 0)]

counts = [count + format(year, '02') for count in counts
                                     for year in range(0, 20)]

standards = elections + counts + others

Example 1. Compare column names against a standard

[12]:
# Ex. 1.

(in_standards, not_in_standards) = dq.compare_column_names(mggg_gdf, standards)
[13]:
in_standards # renders set of column names in mggg_gdf that fit the standards
[13]:
{'2MOREVAP',
 'AMINVAP',
 'ASIANVAP',
 'BVAP',
 'GOV18D',
 'GOV18L',
 'GOV18R',
 'HDIST',
 'NHPIVAP',
 'OTHERVAP',
 'PRES16D',
 'PRES16G',
 'PRES16L',
 'PRES16R',
 'SEN16D',
 'SEN16L',
 'SEN16R',
 'TOTPOP',
 'USH14D',
 'USH14L',
 'USH14R',
 'USH16D',
 'USH16L',
 'USH16R',
 'USH18D',
 'USH18R',
 'VAP',
 'WVAP',
 'geometry'}
[14]:
not_in_standards # renders set of column names in mggg_gdf that don't fit the standards
[14]:
{'2MORE',
 'AMIN',
 'AREA',
 'ASIAN',
 'BLACK',
 'DISTRICT',
 'ID',
 'NHPI',
 'OTHER',
 'POPULATION',
 'PRES16C',
 'WHITE'}

Example 2. Aggregate column values

Example 2.1. Sum the values of one column

[15]:
# Ex. 2.1

aggregates = dq.sum_column_values(mggg_gdf, ['PRES16D'])
aggregates
[15]:
[('PRES16D', 69097)]

Example 2.1. Sum the values of multiple columns

[16]:
# Ex. 2.2.
columns_to_aggregate = ['PRES16D', 'PRES16G', 'PRES16R', 'PRES16L']
aggregates = dq.sum_column_values(mggg_gdf, columns_to_aggregate)
aggregates
[16]:
[('PRES16D', 69097),
 ('PRES16G', 3782),
 ('PRES16L', 12004),
 ('PRES16R', 103457)]
[17]:
# Print results in a pretty format
print('Column name : Sum of column values')
print('----------------------------------')

for aggregate in aggregates:
    (column_name, column_sum) = aggregate
    print('{:11} : {}'.format(column_name, column_sum))
Column name : Sum of column values
----------------------------------
PRES16D     : 69097
PRES16G     : 3782
PRES16L     : 12004
PRES16R     : 103457

Example 3. Compare every value of a column against every value of another column

[18]:
# Setup sample tables for Example 3
df1 = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]],
                   columns=['COL1', 'COL2', 'COL3'])
df2 = pd.DataFrame(data=[[4, 5], [1, 2]], columns=['col2', 'col1'])

Example 3.1. Compare one column from one tables against one column from another table

[19]:
# Ex. 3.1.

results = dq.compare_column_values(df1, df2, ['COL3'], ['col2'])
results
[19]:
{'COL3 [vs] col2': [('0 [vs] 0', -1), ('1 [vs] 1', 5)]}

Example 3.2. Compare multiple columns from two tables against each other

[20]:
# Ex. 3.2.

results = dq.compare_column_values(df1, df2,
                                   ['COL1', 'COL2'],
                                   ['col1', 'col2'])
results
[20]:
{'COL1 [vs] col1': [('0 [vs] 0', -4), ('1 [vs] 1', 2)],
 'COL2 [vs] col2': [('0 [vs] 0', -2), ('1 [vs] 1', 4)]}
[21]:
# Print results in a pretty format
for column_to_column in results:
    print('{} ========'.format(column_to_column))
    for row_to_row, difference in results[column_to_column]:
        print('{:3} : {}'.format(row_to_row, difference))
COL1 [vs] col1 ========
0 [vs] 0 : -4
1 [vs] 1 : 2
COL2 [vs] col2 ========
0 [vs] 0 : -2
1 [vs] 1 : 4

Example 3.3. Compare a row in a column against a specific row in another column

[22]:
# Ex. 3.3.
results = dq.compare_column_values(mggg_gdf, medsl_gdf,
                                   ['PRES16D'], ['US President democratic'],
                                   ['01-446 AURORA'], ['01-446 Aurora'])
results
[22]:
{'PRES16D [vs] US President democratic': [('01-446 AURORA [vs] 01-446 Aurora',
   0.0)]}

Example 3.4. Compare specific rows in a column against specific rows in another column

[23]:
# Ex. 3.4.
results = dq.compare_column_values(mggg_gdf, medsl_gdf,
                                   ['PRES16D'], ['US President democratic'],
                                   ['01-446 AURORA', '01-455 FAIRBANKS NO. 1', '01-465 FAIRBANKS NO. 2'],
                                   ['01-446 Aurora', '01-455 Fairbanks No. 1', '01-465 Fairbanks No. 2'])
results
[23]:
{'PRES16D [vs] US President democratic': [('01-446 AURORA [vs] 01-446 Aurora',
   0.0),
  ('01-455 FAIRBANKS NO. 1 [vs] 01-455 Fairbanks No. 1', 0.0),
  ('01-465 FAIRBANKS NO. 2 [vs] 01-465 Fairbanks No. 2', 0.0)]}
[24]:
# Print results in a pretty format
for column_to_column in results:
    print('{} ========'.format(column_to_column))
    for row_to_row, difference in results[column_to_column]:
        print('{:50} : {}'.format(row_to_row, difference))
PRES16D [vs] US President democratic ========
01-446 AURORA [vs] 01-446 Aurora                   : 0.0
01-455 FAIRBANKS NO. 1 [vs] 01-455 Fairbanks No. 1 : 0.0
01-465 FAIRBANKS NO. 2 [vs] 01-465 Fairbanks No. 2 : 0.0

Example 4. Compare column aggregates

4.1. Compare one column’s aggregates against another column’s aggregates

[25]:
# Ex. 4.1

results = dq.compare_column_sums(mggg_gdf, medsl_gdf,
                                 ['PRES16D'], ['US President democratic'])
results
[25]:
[('PRES16D [vs] US President democratic', -47220.0)]

4.2. Compare multiple column aggregates against multiple column aggregates

[26]:
# Ex. 4.2.
mggg_columns = ['PRES16D', 'PRES16G', 'PRES16R', 'PRES16L']
medsl_columns = ['US President democratic', 'US President green',
                 'US President republican', 'US President libertarian']

results = dq.compare_column_sums(mggg_gdf, medsl_gdf,
                                 mggg_columns, medsl_columns)
results
[26]:
[('PRES16D [vs] US President democratic', -47220.0),
 ('PRES16G [vs] US President green', -1947.5),
 ('PRES16L [vs] US President libertarian', -6717.5),
 ('PRES16R [vs] US President republican', -59910.0)]
[27]:
# Print results in a pretty format
print('Column [vs] Column : Difference of column sums')
print('----------------------------------------------')

for result in results:
    (col_v_col, diff) = result
    print('{:40} : {}'.format(col_v_col, diff))
Column [vs] Column : Difference of column sums
----------------------------------------------
PRES16D [vs] US President democratic     : -47220.0
PRES16G [vs] US President green          : -1947.5
PRES16L [vs] US President libertarian    : -6717.5
PRES16R [vs] US President republican     : -59910.0

Examples Cleanup

The following commands are used to reset and clean up the examples above.

[28]:
# Remove cloned repos
dm.remove_repos('.')
[ ]:
# Remove outputs
!rm -r outputs
[ ]:
# Uninstall Package
!echo y | pip uninstall gdutils
[ ]:
# Reset Jupyter Notebook IPython Kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")