Fetch Census stats for each U.S. Census Tract

First, we import the packages needed to fetch Census statistics for each U.S. Census Tract. A combination of:

  • sys,
  • os,
  • json,
  • io,
  • urllib.request,
  • and csv

is used to process the Census API calls and ultimately write the output as CSV.

N.B. There is no tract data for American Samoa (FIPS 60), Guam (FIPS 66), the Northern Mariana Islands (FIPS 69), or the U.S. Virgin Islands (FIPS 78), so those state-equivalents are excluded from the analysis.

In [ ]:
import sys, os, json, io, urllib.request, csv

Next, we declare the variables needed before making the actual call to the Census API. The script expects a Census API key in the API_KEY environment variable and takes the geographic unit ("tract" or "county") as its first command-line argument.

In [ ]:
# 2010-2014 American Community Survey 5-Year Estimates
dataset = "http://api.census.gov/data/2014/acs5"

api_key = os.environ["API_KEY"]
unit = sys.argv[1] # "tract" or "county"

fields = {
  "B01003_001E": "population",
  "B02001_002E": "white", # white alone

  "B19013_001E": "median_income", # Median Household Income in the Past 12 Months (in 2014 Inflation-Adjusted Dollars)
  "B17001_001E": "poverty_status_denominator", # Poverty Status in the Past 12 Months (universe)
  "B17001_002E": "in_poverty", # Income in the past 12 months below poverty level

  "B08006_001E": "all_workers", # Means of Transportation to Work (universe: workers)
  "B08006_008E": "public_transit", # Public transportation (excluding taxicab)
  "B08006_014E": "bike", # Bicycle
  "B08006_015E": "walked", # Walked

  "B12001_011E": "all_female_marital", # Sex by Marital Status for the Population 15 Years and Over - Female total
  "B12001_013E": "female_married", # Female:!!Now married

  "B25024_001E": "housing_units", # Units in Structure (universe: housing units)
  "B25024_002E": "housing_units_single_detached", # 1, detached
  "B25024_003E": "housing_units_single_attached", # 1, attached
}

metadata = ["state", "county"]
if unit == "tract": metadata += ["tract"]

all_states = ['01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16',
              '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29',
              '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42',
              '44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56', '72']
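
To see what a single request looks like before wrapping it in a function, we can build the URL for one state by hand. This is an illustrative sketch, not part of the pipeline; the Census API returns a JSON array of arrays whose first row is the header, which is what the parsing code below assumes.

In [ ]:
# Sketch: the URL for all tracts (or counties) in Alabama, FIPS 01.
# The response is a JSON array of arrays, header row first, roughly:
#   [["NAME", "B01003_001E", ..., "state", "county", "tract"],
#    ["Census Tract ...", "1948", ..., "01", "001", "020100"]]
# (the values shown are hypothetical)
example_url = dataset + "?key=%s&get=NAME,%s&for=%s&in=state:%s" \
  % (api_key, ",".join(fields), unit, "01")
print(example_url)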

With that out of the way, we define a generator function that loads the data for every tract (or county) in a given state, identified by its two-digit FIPS code.

In [ ]:
def get_data_for_state(state):
  # Load the data for all units in the given state, passed as a FIPS code.

  # Form the URL.
  url = dataset + "?key=%s&get=NAME,%s&for=%s&in=state:%s" \
    % (api_key, ",".join(fields), unit, state)

  try:
    print(state + "...", file=sys.stderr)

    # Make the HTTP request and load the JSON response.
    data = json.load(io.TextIOWrapper(urllib.request.urlopen(url)))
    header = data[0]
    data = data[1:]

    # Yield each row as a dict, e.g.:
    # { 'state': '05', 'county': '145', 'tract': '070500', 'B01003_001E': '6300' }
    for row in data:
      yield dict(zip(header, row))

  except Exception as e:
    raise Exception("Error %s loading data for state %s at %s." % (str(e), state, url))
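
As a quick smoke test of the generator (a sketch; it makes one live API call using the key and unit configured above), we can pull the first row for Alabama:

In [ ]:
# Sketch: fetch and inspect the first row for Alabama (FIPS 01).
first = next(get_data_for_state("01"))
print(first["NAME"], first["B01003_001E"])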

Finally, we write the results as CSV to standard output.

In [ ]:
# Loop over states, query the Census API for each state, and write stats to a CSV file.
w = csv.writer(sys.stdout)
cols = metadata + sorted(fields)
w.writerow([fields.get(c, c) for c in cols])
for state in all_states:
  for tract in get_data_for_state(state):
    w.writerow([tract.get(f) for f in cols])
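
Since cols is the geography columns plus the variable codes in sorted order, and the header row maps each code through fields.get, the first line of the CSV is deterministic. A quick sanity check of what it should contain for tracts:

In [ ]:
# Sanity check (sketch): the expected header row for unit == "tract" is
# state, county, tract, followed by the friendly names in sorted-code order
# (population, white, all_workers, public_transit, bike, walked, ...).
print([fields.get(c, c) for c in ["state", "county", "tract"] + sorted(fields)])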

Computing Pixels

Again, we first import the packages needed for computing pixels. These are:

  • sys,
  • glob,
  • csv,
  • collections,
  • shapefile,
  • tqdm,
  • pyproj,
  • and PIL.

These packages allow us to load the shapefile geography, define the coordinate system, and ultimately rasterize the shapes to determine their size in pixels.

In [ ]:
import sys
import glob
import csv
from collections import defaultdict

import shapefile
import tqdm
import pyproj
from PIL import Image, ImageDraw, ImageFont

Next, we declare the variables needed before the actual projection to raster. This script takes the geographic unit ("tract" or "county") as its first command-line argument and the raster width in pixels as its second.

In [ ]:
##########################################

unit = sys.argv[1] # "tract" or "county"
width = int(sys.argv[2])

##########################################

contiguous_us = ("01", "04", "05", "06", "08", "09", "10", "11", "12", "13", "16",
	             "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27",
	             "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38",
	             "39", "40", "41", "42", "44", "45", "46", "47", "48", "49", "50",
	             "51", "53", "54", "55", "56", )

# Define map projections (lat/lng and a nice one for contiguous US)
p_latlng = pyproj.Proj(init='EPSG:4326')
p_mapproj = pyproj.Proj("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=clrk66 +units=m +no_defs") # http://spatialreference.org/ref/sr-org/7271/

# This is the bounding box of the projected map coordinates.
projected_bounds = [(-2387000, 254700), (2263000, 3169000)]

# Raster dimensions.
height = int(round(width * (projected_bounds[1][1] - projected_bounds[0][1]) / (projected_bounds[1][0] - projected_bounds[0][0])))
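
As a worked example (assuming a width of 800 was passed on the command line): the projected bounding box is 2263000 − (−2387000) = 4,650,000 m wide and 3169000 − 254700 = 2,914,300 m tall, an aspect ratio of about 0.627, so the raster comes out 501 pixels tall.

In [ ]:
# Worked example of the height computation, assuming width = 800:
assert int(round(800 * (3169000 - 254700) / (2263000 - -2387000))) == 501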

Now we define a helper function that projects lng/lat into map coordinates scaled to [0, 1]. Then we load the shapefiles so we can get a count of all shapes, and set up a mapping that will record which geographic units occur in which pixels.

In [ ]:
# Helper function to project into map coordinates in [0,1].
def project(lng, lat):
	# Project.
	lng, lat = pyproj.transform(p_latlng, p_mapproj, lng, lat)

	# Scale to [0, 1].
	lng = (lng-projected_bounds[0][0]) / (projected_bounds[1][0] - projected_bounds[0][0])
	lat = (lat-projected_bounds[0][1]) / (projected_bounds[1][1] - projected_bounds[0][1])

	return (lng, lat)

# load shapefiles first so we can get a count of all shapes
total_shapes = 0
shapefiles = []
for fn in glob.glob("shapefiles_%s/*.shp" % unit):
	sf = shapefile.Reader(fn)
	shapefiles.append(sf)
	total_shapes += sf.numRecords

# iterator over all files
def iter_all_shapes():
	for sf in shapefiles:
		fields = [f[0] for f in sf.fields[1:]]
		for i, shape in enumerate(sf.iterShapes()):
			  # note: DeletionFlag is first in sf.fields but is not in sf.record(i)
			yield (sf, shape, dict(zip(fields, sf.record(i))))

# Store which geographic units occur in which pixels.
pixels = defaultdict(lambda : [])
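
As a rough sanity check of project() (a sketch using approximate coordinates), a point near the geographic center of the contiguous U.S. should land somewhere near the middle of the unit square:

In [ ]:
# Rough sanity check: Lebanon, Kansas (about 39.8°N, 98.6°W) sits near the
# geographic center of the contiguous U.S., so both values should come out
# somewhere around 0.5.
x, y = project(-98.6, 39.8)
print(x, y)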

Finally, we draw the combined map and, by iterating through all the shapes we've loaded, record which geographic units land on which pixels, which tells us how many pixels each shape covers.

In [ ]:
# Combined map.
img_all = Image.new("L", (width, height), 0)
draw_all = ImageDraw.Draw(img_all)

# iterate through all of the shapes
for sf, shape, fields in \
	tqdm.tqdm(iter_all_shapes(), total=total_shapes, file=sys.stderr, leave=True):
	assert shape.shapeType == 5

	# Because we're making a statement about how the units appear on a map,
	# project the lat/lng coordinates into a standard US map projection.
	# Skip the states that aren't in the contiguous U.S., since far-away
	# places will come out distorted in a projection meant for the contiguous
	# U.S.
	if fields["STATEFP"] not in contiguous_us:
		continue

	geo_id = fields["GEOID"]
	# fields["ALAND"] is land (excluding water) area as computed by the Census,
	# which is also interesting.

	# Project points to map coordinates.
	polygon = shape.points
	polygon = [project(lng, lat) for (lng, lat) in polygon]

	# Scale to image coordinates.
	polygon = [(lng*width, height - lat*height) for (lng, lat) in polygon]

	# Create an empty greyscale image of a certain pixel size to rasterize the tract.
	img = Image.new("L", (width, height), 0)
	draw = ImageDraw.Draw(img)

	# Draw the shape.
	draw.polygon(polygon, fill=255)
	draw_all.polygon(polygon, fill=255)

	# We could count up the number of pixels that are now turned on
	# by calling img.histogram(), which is really fast. But because
	# at regular map resolutions some tracts will tend to fall on the same
	# pixels, we'll record which geographic units end up on which pixels.
	# img.getdata() returns a flat array of the pixels in raster order,
	# by row. By enumerating, each pixel gets a consistent integer identifier.
	is_drawn = False
	for pixel, value in enumerate(img.getdata()):
		if value > 0: # really just 255 so long as there is no antialiasing
			if value != 255: raise Exception("encountered a pixel value that's not 0 or 255")
			pixels[pixel].append(geo_id)
			is_drawn = True

	if not is_drawn:
		# Shape was too small to be represented in the raster image at all.
		# That means it's smaller than one pixel. All of the points on the
		# polygon will probably round to the same coordinate. But to be
		# sure, take the average of the coordinates and mark the shape
		# as being drawn at that one lump location.
		x = int(round(sum(pt[0] for pt in polygon)/len(polygon)))
		y = int(round(sum(pt[1] for pt in polygon)/len(polygon)))
		pixel = y*width + x
		pixels[pixel].append(geo_id)

	# Release object.
	del draw

# Save the image that has all of the shapes drawn, to verify that we're
# drawing the right thing.
del draw_all
img_all.save("map_%s_%d.png" % (unit, width), format="png")
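
Because img.getdata() flattens the raster row by row, the integer identifier for the pixel at (x, y) is y*width + x; the final step below inverts that. A tiny sketch of the round trip, with a hypothetical width of 800:

In [ ]:
# Round trip of the flat raster index (hypothetical width of 800):
# the pixel at (x=5, y=2) has id 2*800 + 5 = 1605, and modulo and
# floor division recover the coordinates.
pid = 2 * 800 + 5
assert (pid % 800, pid // 800) == (5, 2)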

To save the results, we again output CSV: one row for each pixel that got lit up, listing the geographic units that cover it.

In [ ]:
# Write out each pixel that got lit up at all and the geographic unit IDs that did so.
w = csv.writer(sys.stdout)
for pixel_id, geoid_list in sorted(pixels.items()):
	x = (pixel_id % width)
	y = pixel_id // width
	w.writerow([x, y] + geoid_list)
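
The CSV maps each lit pixel to the geographic units that cover it; to get a pixel count per shape, invert the mapping. A minimal sketch, assuming the output above was redirected to a file named pixels_tract_800.csv (a hypothetical filename):

In [ ]:
# Minimal sketch (hypothetical filename): invert the pixel -> GEOIDs mapping
# into a per-unit pixel count.
from collections import Counter
pixel_counts = Counter()
with open("pixels_tract_800.csv") as f:
	for row in csv.reader(f):
		for geo_id in row[2:]:  # columns after x, y are GEOIDs
			pixel_counts[geo_id] += 1
print(pixel_counts.most_common(5))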