Commit 644b62aa authored by Tuccillo, Joe's avatar Tuccillo, Joe
Browse files

migrate files and update README

parent 81d571f0
# pymedm
Penalized Maximum-Entropy Dasymetric Modeling (P-MEDM) in Python
\ No newline at end of file
Penalized Maximum-Entropy Dasymetric Modeling (P-MEDM) in Python
This is a _Work in Progress_ Python port of [PMEDMrcpp](https://bitbucket.org/nnnagle/pmedmrcpp/src/master).
#### References
1. Leyk, S., Nagle, N. N., & Buttenfield, B. P. (2013). Maximum entropy dasymetric modeling for demographic small area estimation. Geographical Analysis, 45(3), 285-306.
2. Nagle, N. N., Buttenfield, B. P., Leyk, S., & Spielman, S. (2014). Dasymetric modeling and uncertainty. Annals of the Association of American Geographers, 104(1), 80-95.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 29 13:46:41 2021
@author: joe
"""
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.optimize import minimize
class pmedm:
"""P-MEDM class.
Attributes:
serial: Response IDs.
wt: Sample weights.
N: Total population.
n: Total responses.
pX: Individual-level constraints.
Y1: Level-1 (aggregate) constraints.
Y2: Level-2 (target) constraints.
Y_vec: Vectorized constraints.
V1: Level-1 (aggregate) error variances.
V2: Level-2 (target) error variances.
V_vec: Vectorized error variances.
sV: Diagonal matrix of error variances.
topo: Geographic crosswalk.
A1: Geographic crosswalk (np-array).
A2: Level-2 (target) identity matrix.
X: Solution space.
q: Occurrence probabilities.
lam: Parameters.
"""
def __init__(self, serial, wt, cind, cg1, cg2, sg1, sg2, q = None,\
lam = None, verbose = True, keep_solver = True):
"""
Parameters
----------
serial : TYPE
DESCRIPTION.
wt : TYPE
DESCRIPTION.
cind : TYPE
DESCRIPTION.
cg1 : TYPE
DESCRIPTION.
cg2 : TYPE
DESCRIPTION.
sg1 : TYPE
DESCRIPTION.
sg2 : TYPE
DESCRIPTION.
q : TYPE, optional
DESCRIPTION. The default is None.
lam : TYPE, optional
DESCRIPTION. The default is None.
Returns
-------
None.
"""
# Response IDs
self.serial = serial
# Sample weights
self.wt = wt
# Total population
self.N = np.sum(self.wt)
# Total responses
self.n = cind.shape[0]
# Individual constraints
self.pX = cind.values
# Geographic constraints
self.Y1 = cg1.values
self.Y2 = cg2.values
self.Y_vec = np.concatenate([self.Y1.flatten('A'), self.Y2.flatten('A')]) / self.N
# Geographic constraint error variances
self.V1 = np.square(sg1.astype('float').values)
self.V2 = np.square(sg2.astype('float').values)
self.V_vec = np.concatenate([self.V1.flatten('A'), self.V2.flatten('A')])\
* (self.n / self.N**2)
self.sV = np.diag(self.V_vec)
# Geographic topology
self.topo = pd.DataFrame({'g1': cg2.index.values,\
'g2': [str(i)[0] for i in cg2.index.values]})
self.A1 = np.array([1 * (self.topo.g2.values == G)\
for G in np.unique(self.topo.g2.values)])
self.A2 = np.identity(self.topo.shape[0])
# Solution space
X1 = np.kron(np.transpose(self.pX), self.A1)
X2 = np.kron(np.transpose(self.pX), self.A2)
self.X = np.transpose(np.vstack((X1, X2)))
# Initial probabilities
if q is None:
self.q = np.repeat(wt, self.A1.shape[1], axis = 0)
self.q = self.q / np.sum(self.q)
# Initial parameters
if lam is None:
self.lam = np.zeros((len(self.Y_vec),))
# solver gradient
self.grad = None
# solver hessian
self.hess = None
# for storing solver results
self.res = None
# print output?
self.verbose = verbose
# keep P-MEDM solver results?
self.keep_solver = keep_solver
@staticmethod
def f(lam, q, X, Y_vec, sV):
"""Objective function
"""
qXl = q * np.exp(np.matmul(-X, lam))
lvl = lam.dot(np.matmul(sV, lam))
F = Y_vec.dot(lam) + np.log(np.sum(qXl)) + (0.5 * lvl)
return F
@staticmethod
def g(lam, q, X, Y_vec, sV):
"""Gradient function
"""
qXl = q * np.exp(np.matmul(-X, lam))
p = qXl / np.sum(qXl)
Xp = np.matmul(np.transpose(X), p)
G = Y_vec + np.matmul(sV, lam) - Xp
return G
@staticmethod
def h(lam, q, X, Y_vec, sV):
"""Hessian function
"""
qXl = q * np.exp(np.matmul(-X, lam))
p = qXl / np.sum(qXl)
# first term
h1a = np.matmul(np.transpose(X), p)
h1b = p.dot(X)
h1 = h1a[:,None] * h1b
# second term
dp = sparse.csr_matrix(np.diag(p))
h2 = np.matmul(X.transpose(), dp.dot(X))
H = -h1 + h2 + sV
return H
def update_probabilities(self, lam):
"""Updates occurrence probabilities
"""
qXl = self.q * np.exp(np.matmul(-self.X, lam))
self.q = qXl / np.sum(qXl)
def solve(self):
minim = minimize(method = 'trust-krylov',\
fun = self.f, x0 = self.lam, jac = self.g, hess = self.h,\
args=(self.q, self.X, self.Y_vec, self.sV),\
options = {'disp' : self.verbose, 'return_all': True})
# updates
self.lam = minim.x
self.update_probabilities(self.lam)
self.grad = minim.jac
self.hess = minim.hess
if self.keep_solver:
self.res = minim
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 1 07:28:57 2021
@author: joe
"""
#%%
# Total population
N = np.sum(wt)
#%%
#%%
# Total responses
n = cind.shape[0]
#%%
#%%
# Individual constraints
pX = cind.values
#%%
#%%
# Geographic constraints
Y1 = ctrt.values
Y2 = cbg.values
Y_vec = np.concatenate([Y1.flatten('A'), Y2.flatten('A')]) / N
#%%
#%%
# Geographic constraint error variances
V1 = np.square(strt.astype('float').values)
V2 = np.square(sbg.astype('float').values)
V_vec = np.concatenate([V1.flatten('A'), V2.flatten('A')])\
* (n / N**2)
#%%
#%%
sV = np.diag(V_vec)
#%%
#%%
# Geographic topology
topo = pd.DataFrame({'g2': cbg.index.values,\
'g1': [str(i)[0] for i in cbg.index.values]})
#%%
#%%
A1 = np.array([1 * (topo.g1.values == G)\
for G in np.unique(topo.g1.values)])
#%%
#%%
A2 = np.identity(topo.shape[0])
#%%
# Solution space
X1 = np.kron(np.transpose(pX), A1)
X2 = np.kron(np.transpose(pX), A2)
X = np.transpose(np.vstack((X1, X2)))
# Prior probabilities
if q is None:
q = np.repeat(wt, A1.shape[1], axis = 0)
q = q / np.sum(q)
# Initial parameters
if lam is None:
lam = np.zeros((len(Y_vec),))
\ No newline at end of file
"","GEOID","POP","CONST1","CONST2","CONST3","POPs","CONST1s","CONST2s","CONST3s"
"1","10",344,152,101,175,9.4339811320566,7.81024967590665,12.328828005938,11.1803398874989
"2","20",260,84,106,107,4.24264068711928,7.61577310586391,8.24621125123532,7.81024967590665
"3","30",682,200,242,293,8.36660026534076,12.4899959967968,14.3527000944073,10
GEOID,POP,POPs,CONST1,CONST1s,CONST2,CONST2s,CONST3,CONST3s
101,112,6,78,6,12,6,23,3
102,122,7,47,3,18,4,94,4
103,110,2,27,4,71,10,58,10
201,110,3,53,3,10,8,67,5
202,150,3,31,7,96,2,40,6
301,136,4,49,6,28,8,58,5
302,144,3,13,6,42,4,74,3
303,148,2,13,2,61,9,31,5
304,122,5,81,8,33,6,52,5
305,132,4,44,4,78,3,78,4
GEOID,POP,POPs,CONST1,CONST1s,CONST2,CONST2s,CONST3,CONST3s,CONST4,CONST4s,CONST5,CONST5s
10,28,4,15,3.5,2,1,6,3,1,0.5,1,0.5
11,20,4,13,3.5,5,3,2,1,2,1,7,2
20,30,4,23,3.5,5,3,3,2,0,0.1,3,2
21,22,4,19,3.5,8,3.5,4,2,2,1,4,2
GEOID,POP,POPs,CONST1,CONST1s,CONST2,CONST2s,CONST3,CONST3s,CONST4,CONST4s,CONST5,CONST5s,CONST6,CONST6s
10,28,2,15,1.75,25,2.25,15,3,3,0.5,16,2.75,5,1
11,20,2,13,1.5,13,2.25,12,2,2,0.5,17,2.75,8,1
20,30,2,18,1.75,22,2.25,20,2,3,0.5,27,2.75,7,1
21,22,2,19,1.75,15,2.25,18,2,2,0.5,20,2.75,10,1
SERIAL,PERWT,POP,CONST1,CONST2,CONST3
p1,24,1,1,0,0
p2,79,1,0,0,1
p3,95,1,1,1,0
p4,95,1,0,1,0
p5,55,1,1,1,1
p6,69,1,0,0,0
p7,154,1,1,0,0
p8,24,1,1,0,1
p9,20,1,1,1,0
p10,14,1,1,0,1
p11,45,1,1,0,0
p12,179,1,0,0,1
p13,197,1,0,1,0
p14,57,1,0,0,1
p15,181,1,0,0,1
SERIAL,PERWT,CONST1,CONST2,CONST3,CONST4,CONST5
A,50,1,0,0,0,0
B,20,1,1,0,0,0
C,10,0,0,1,0,0
D,5,0,0,1,1,0
E,15,0,0,0,0,1
SERIAL,PERWT,POP,CONST1,CONST2,CONST3,CONST4,CONST5,CONST6
A,50,1,1,1,1,0,1,0
B,20,1,0,0,0,0,1,1
C,10,1,1,1,0,1,1,1
D,5,1,1,0,0,0,0,0
E,15,1,0,1,1,0,0,0
"","GEOID","POP","CONST1","CONST2","CONST3","POPs","CONST1s","CONST2s","CONST3s"
"1","10",344,152,101,175,9.4339811320566,7.81024967590665,12.328828005938,11.1803398874989
"2","20",260,84,106,107,4.24264068711928,7.61577310586391,8.24621125123532,7.81024967590665
"3","30",682,200,242,293,8.36660026534076,12.4899959967968,14.3527000944073,10
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 29 18:28:37 2021
@author: joe
"""
#%%
import sys
sys.path.append('code')
import numpy as np
import pandas as pd
import pmedm
# from pmedm import pmedm, f, g, h, solve
## Individual constraints
# cind = pd.read_csv('data/toy_constraints_ind3.csv')
cind = pd.read_csv('data/toy_constraints_ind.csv')
# cbg0 = pd.read_csv('data/toy_constraints_bg3.csv')
cbg0 = pd.read_csv('data/toy_constraints_bg.csv')
## Block group constraints
cbg0.index = cbg0.GEOID.values
# separate ests and standard errors
se_cols = cbg0.columns[np.where([k.endswith('s') for k in cbg0.columns])]
est_cols = cbg0.columns[np.where([(k not in se_cols) & (k != 'GEOID') for k in cbg0.columns])]
cbg = cbg0[est_cols]
sbg = cbg0[se_cols]
## Tract constraints
ctrt0 = cbg0.copy()
ctrt0.index = [str(i)[0] for i in ctrt0.GEOID.values]
ctrt0 = ctrt0.drop('GEOID', axis = 1)
# separate ests and standard errors
ctrt = ctrt0[est_cols]
strt = ctrt0[se_cols]
# aggregate ests
ctrt = ctrt.groupby(ctrt.index).aggregate(sum)
# aggregate SE's
strt = strt.groupby(strt.index).aggregate(lambda x: np.sqrt(np.sum(np.square(x))))
## response IDs
serial = cind.SERIAL.values
## sample weights
wt = cind.PERWT.values
## keep only constraints
cind = cind.drop(['SERIAL', 'PERWT'], axis = 1)
#%%
#%%
## Create P-MEDM object
pmd = pmedm.pmedm(serial = serial, wt = wt, cind = cind, cg1 = ctrt, cg2 = cbg,\
sg1 = strt, sg2 = sbg)
#%%
#%%
res = pmd.solve()
#%%
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment