Kotevska, Olivera / privacy-standard-techniques · Commits

Commit 21929b55, authored Sep 12, 2021 by Kotevska, Olivera
Delete e-differential-privacy.py
parent c1081ea0 · Changes: 1

e-differential-privacy.py (deleted, 100644 → 0) · view file @ c1081ea0
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 15 14:50:30 2021

@author: marti
"""
import pandas as pd
import matplotlib.pylab as pl
import matplotlib.patches as patches
import numpy as np         # needed by _update_mean_variance and randomise below
from random import random  # needed by randomise below

# Working state for the partitioning passes.
categorical = []
time_rows = []
temp_partitions = []
actual_partitions = []
count_partition = 0
count_row = 0
count_column = 0

# Load the residential load profiles (path is machine-specific).
cols_to_use = ['Time', 'Household 1', 'Household 2', 'Household 3']
df = pd.read_excel(
    "/Users/ok0/OneDrive - Oak Ridge National Laboratory/Work in progress/Mentoring/Students/2021/SULI/project_desc_tasks_plan/code/Residential-Profiles.xlsx",
    sheet_name='Residential-Profiles.csv',
    index_col=False,
    usecols=cols_to_use)
# Treat the three household columns as categorical quasi-identifiers.
for column in df.columns[1:4]:
    categorical.append(column)
set(categorical)  # no-op: the result is discarded

for row in df['Time']:
    time_rows.append(row)

for name in categorical:
    df[name] = df[name].astype('category')
def get_spans(df, partition, scale=None):
    """Return the span of every column restricted to the given partition:
    the number of distinct categories for categorical columns, the numeric
    range otherwise, optionally rescaled by `scale`."""
    spans = {}
    for column in df.columns:
        if column in categorical:
            span = len(df[column][partition].unique())
        else:
            span = df[column][partition].max() - df[column][partition].min()
        if scale is not None:
            span = span / scale[column]
        spans[column] = span
    return spans


full_spans = get_spans(df, df.index)
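# full_spans serves as the normalisation scale later: partition_dataset
# divides each partition's span by the full-dataset span, so columns with
# very different ranges compete fairly for the next split.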
def split(df, partition, column):
    """Split a partition in two along one column: halve the category set
    for categorical columns, cut at the median for numerical ones."""
    dfp = df[column][partition]
    if column in categorical:
        values = dfp.unique()
        lv = set(values[:len(values) // 2])
        rv = set(values[len(values) // 2:])
        return dfp.index[dfp.isin(lv)], dfp.index[dfp.isin(rv)]
    else:
        median = dfp.median()
        dfl = dfp.index[dfp < median]
        dfr = dfp.index[dfp >= median]
        return (dfl, dfr)
def is_k_anonymous(df, partition, sensitive_column, k=3):
    if len(partition) < k:
        return False
    return True
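# A partition is k-anonymous when it holds at least k rows, so every record
# in it is hidden among at least k-1 others sharing the same generalised
# quasi-identifiers. sensitive_column is unused here; it is kept so all the
# validity checks below share one signature.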
def partition_dataset(df, feature_columns, sensitive_column, scale, is_valid):
    finished_partitions = []
    partitions = [df.index]
    while partitions:
        partition = partitions.pop(0)
        spans = get_spans(df[feature_columns], partition, scale)
        # Try columns from the widest scaled span downwards.
        for column, span in sorted(spans.items(), key=lambda x: -x[1]):
            lp, rp = split(df, partition, column)
            if not is_valid(df, lp, sensitive_column) or not is_valid(df, rp, sensitive_column):
                continue
            partitions.extend((lp, rp))
            break
        else:
            # No column admits a valid split: the partition is final.
            finished_partitions.append(partition)
    return finished_partitions


feature_columns = categorical
# The sensitive attribute, by name. (The original assigned the list of raw
# 'Time' values here, but every later use expects a column label.)
sensitive_column = 'Household 3'
finished_partitions = partition_dataset(df, feature_columns, sensitive_column,
                                        full_spans, is_k_anonymous)
print(len(finished_partitions))
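# partition_dataset is a greedy, Mondrian-style top-down scheme: each working
# partition is split along the column with the widest scaled span, and a
# split is kept only if both halves pass is_valid. The for/else files a
# partition as finished once no column yields two valid halves.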
def build_indexes(df):
    """Map each categorical value to an integer position for plotting."""
    indexes = {}
    for column in categorical:
        values = sorted(df[column].unique())
        indexes[column] = {x: y for x, y in zip(values, range(len(values)))}
    return indexes
def get_coords(df, column, partition, indexes, offset=0.1):
    """Return the (left, right) plot coordinates covered by a partition."""
    if column in categorical:
        sv = df[column][partition].sort_values()
        l, r = indexes[column][sv[sv.index[0]]], indexes[column][sv[sv.index[-1]]] + 1.0
    else:
        sv = df[column][partition].sort_values()
        next_value = sv[sv.index[-1]]
        larger_values = df[df[column] > next_value][column]
        if len(larger_values) > 0:
            next_value = larger_values.min()
        l = sv[sv.index[0]]
        r = next_value
    # Pad slightly so adjacent rectangles do not overlap visually.
    l -= offset
    r += offset
    return l, r
def get_partition_rects(df, partitions, column_x, column_y, indexes, offsets=[0.1, 0.1]):
    rects = []
    for partition in partitions:
        xl, xr = get_coords(df, column_x, partition, indexes, offset=offsets[0])
        yl, yr = get_coords(df, column_y, partition, indexes, offset=offsets[1])
        rects.append(((xl, yl), (xr, yr)))
    return rects
def _update_mean_variance(self, n_past, mu, var, X, sample_weight=None):
    if X.shape[0] == 0:
        return mu, var
    if sample_weight is not None:
        warn_unused_args("sample_weight")  # helper from the surrounding library, not defined here
    n_new = X.shape[0]
    new_var = np.var(X, axis=0)
    new_mu = np.mean(X, axis=0)
    # Perturb the batch statistics before merging them in.
    new_mu, new_var = self.randomise(new_mu, new_var, self.new_n_samples)
    if n_past == 0:
        return new_mu, new_var
    n_total = float(n_past + n_new)
    total_mu = (n_new * new_mu + n_past * mu) / n_total
    # Combine sums of squared differences; the correction term must be
    # squared (the exponent was lost in the original).
    old_ssd = n_past * var
    new_ssd = n_new * new_var
    total_ssd = old_ssd + new_ssd + (n_past / float(n_new * n_total)) * (n_new * mu - n_new * new_mu) ** 2
    total_var = total_ssd / n_total
    return total_mu, total_var
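# _update_mean_variance mirrors the incremental mean/variance update in
# sklearn's GaussianNB, with the batch statistics perturbed via
# self.randomise before merging. It is a method excerpt: self,
# self.new_n_samples and warn_unused_args belong to a surrounding
# differentially-private classifier (diffprivlib-style) and are not defined
# in this script, so the function is kept here for reference only.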
def get_bounds(df, column, indexes, offset=1.0):
    if column in categorical:
        return 0 - offset, len(indexes[column]) + offset
    return df[column].min() - offset, df[column].max() + offset
def randomise(self, value):
    self.check_inputs(value)
    scale = self._sensitivity / (self._epsilon - np.log(1 - self._delta))
    unif_rv = random() - 0.5
    return value - scale * np.sign(unif_rv) * np.log(1 - 2 * np.abs(unif_rv))
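# randomise is likewise a method excerpt: check_inputs, _sensitivity,
# _epsilon and _delta live on the mechanism object. A minimal stand-alone
# sketch of the same draw, with the privacy parameters passed explicitly
# (the function name and signature are illustrative, not from the original):
def _laplace_noise_sketch(value, sensitivity, epsilon, delta=0.0):
    """Perturb value with Laplace noise of scale sensitivity / (epsilon - log(1 - delta))."""
    scale = sensitivity / (epsilon - np.log(1 - delta))
    unif_rv = random() - 0.5  # uniform on [-0.5, 0.5)
    # Inverse-CDF sample of a zero-centred Laplace variate:
    return value - scale * np.sign(unif_rv) * np.log(1 - 2 * np.abs(unif_rv))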
indexes = build_indexes(df)
column_x, column_y = feature_columns[:2]
rects = get_partition_rects(df, finished_partitions, column_x, column_y,
                            indexes, offsets=[0.0, 0.0])
print(rects[:10])
def plot_rects(df, ax, rects, column_x, column_y, edgecolor='black', facecolor='none'):
    for (xl, yl), (xr, yr) in rects:
        ax.add_patch(patches.Rectangle((xl, yl), xr - xl, yr - yl, linewidth=1,
                                       edgecolor=edgecolor, facecolor=facecolor, alpha=0.5))
    ax.set_xlim(*get_bounds(df, column_x, indexes))
    ax.set_ylim(*get_bounds(df, column_y, indexes))
    ax.set_xlabel(column_x)
    ax.set_ylabel(column_y)


# Overlay the k-anonymous partitions on a scatter of the two features.
pl.figure(figsize=(20, 20))
ax = pl.subplot(111)
plot_rects(df, ax, rects, column_x, column_y, facecolor='r')
pl.scatter(df[column_x], df[column_y])
pl.show()
def agg_categorical_column(series):
    return [','.join(set(series))]


def agg_numerical_column(series):
    return [series.mean()]
def build_anonymized_dataset(df, partitions, feature_columns, sensitive_column, max_partitions=None):
    aggregations = {}
    for column in feature_columns:
        if column in categorical:
            aggregations[column] = agg_categorical_column
        else:
            aggregations[column] = agg_numerical_column
    rows = []
    for i, partition in enumerate(partitions):
        if i % 100 == 1:
            print("Finished {} partitions...".format(i))
        if max_partitions is not None and i > max_partitions:
            break
        grouped_columns = df.loc[partition].agg(aggregations)
        # Count sensitive values per partition; selecting the grouping column
        # keeps this working on current pandas (the original passed a dict
        # plus a squeeze kwarg, which newer pandas rejects).
        sensitive_counts = df.loc[partition].groupby(sensitive_column)[sensitive_column].agg('count')
        values = grouped_columns.iloc[0].to_dict()
        for sensitive_value, count in sensitive_counts.items():
            if count == 0:
                continue
            values.update({
                sensitive_column: sensitive_value,
                'count': count,
            })
            rows.append(values.copy())
    return pd.DataFrame(rows)


dfn = build_anonymized_dataset(df, finished_partitions, feature_columns, 'Household 3')
print(dfn.sort_values(feature_columns + [sensitive_column]))
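# Each row of dfn carries the generalised quasi-identifiers of one partition
# plus one sensitive value and its count within that partition, i.e. the
# usual k-anonymous release format.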
def diversity(df, partition, column):
    return len(df[column][partition].unique())


def is_l_diverse(df, partition, sensitive_column, l=2):
    return diversity(df, partition, sensitive_column) >= l
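# A partition is (distinct) l-diverse when the sensitive attribute takes at
# least l different values inside it, so membership in the partition alone
# does not reveal anyone's sensitive value.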
finished_l_diverse_partitions = partition_dataset(
    df, feature_columns, 'Household 3', full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args))
print(len(finished_l_diverse_partitions))
column_x, column_y = feature_columns[:2]
l_diverse_rects = get_partition_rects(df, finished_l_diverse_partitions, column_x, column_y,
                                      indexes, offsets=[0.0, 0.0])

# Plot the l-diverse partitions (blue) over the k-anonymous ones (green).
pl.figure(figsize=(20, 20))
ax = pl.subplot(111)
plot_rects(df, ax, l_diverse_rects, column_x, column_y, edgecolor='b', facecolor='b')
plot_rects(df, ax, rects, column_x, column_y, facecolor='g')
pl.scatter(df[column_x], df[column_y])
pl.show()

# dfl = build_anonymized_dataset(df, finished_l_diverse_partitions, feature_columns, sensitive_column)
# print(dfl.sort_values([column_x, column_y, sensitive_column]))
# Global frequency of each sensitive value, used as the reference
# distribution for t-closeness.
global_freqs = {}
total_count = float(len(df))
group_counts = df.groupby('Household 3')['Household 3'].agg('count')
for value, count in group_counts.to_dict().items():
    p = count / total_count
    global_freqs[value] = p
print(global_freqs)
def t_closeness(df, partition, column, global_freqs):
    total_count = float(len(partition))
    d_max = None
    group_counts = df.loc[partition].groupby(column)[column].agg('count')
    for value, count in group_counts.to_dict().items():
        p = count / total_count
        d = abs(p - global_freqs[value])
        if d_max is None or d > d_max:
            d_max = d
    return d_max
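# t_closeness returns the largest per-value gap between the partition's
# frequency of the sensitive attribute and its global frequency. Classic
# t-closeness uses the Earth Mover's Distance; for categorical values this
# max-gap check is a simple stand-in.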
def is_t_close(df, partition, sensitive_column, global_freqs, p=0.2):
    if sensitive_column not in categorical:
        raise ValueError("this method only works for categorical values")
    return t_closeness(df, partition, sensitive_column, global_freqs) <= p


finished_t_close_partitions = partition_dataset(
    df, feature_columns, 'Household 3', full_spans,
    lambda *args: is_k_anonymous(*args) and is_t_close(*args, global_freqs))
print(len(finished_t_close_partitions))
# dft = build_anonymized_dataset(df, finished_t_close_partitions, feature_columns, sensitive_column)
# print(dft.sort_values([column_x, column_y, sensitive_column]))

column_x, column_y = feature_columns[:2]
t_close_rects = get_partition_rects(df, finished_t_close_partitions, column_x, column_y,
                                    indexes, offsets=[0.0, 0.0])

# Plot the t-close partitions.
pl.figure(figsize=(20, 20))
ax = pl.subplot(111)
plot_rects(df, ax, t_close_rects, column_x, column_y, edgecolor='k', facecolor='y')
pl.scatter(df[column_x], df[column_y])
pl.show()