Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Cage, Marshall Andrew
arctool
Commits
4ebf9c71
Commit
4ebf9c71
authored
Aug 09, 2016
by
Cage, Marshall Andrew
Browse files
Improve duplicate removal in email dump
parent
0d0084a8
Changes
1
Show whitespace changes
Inline
Side-by-side
packages/Default/emaildump.py
View file @
4ebf9c71
...
...
@@ -35,6 +35,7 @@ class Plugin(arcclasses.Plugin):
self
.
contextFilters
=
[]
self
.
igHeader
=
False
self
.
addHeader
=
False
self
.
igPles
=
False
self
.
igReplies
=
False
self
.
igDup
=
False
...
...
@@ -72,6 +73,9 @@ class Plugin(arcclasses.Plugin):
self
.
widget
.
contextCheck
.
stateChanged
.
connect
(
lambda
x
:
self
.
widget
.
fetchButton
.
setEnabled
(
x
>
0
)
)
self
.
widget
.
infoCheck
.
stateChanged
.
connect
(
lambda
x
:
self
.
setAddHeader
(
x
>
0
)
)
self
.
widget
.
headerCheck
.
stateChanged
.
connect
(
lambda
x
:
self
.
setIgHeader
(
x
>
0
)
)
...
...
@@ -124,6 +128,8 @@ class Plugin(arcclasses.Plugin):
(
self
.
widget
.
delimeterEdit
.
text
(),
'text'
)
self
.
options
[
'contextCheck'
]
=
\
(
self
.
widget
.
contextCheck
.
isChecked
(),
'checked'
)
self
.
options
[
'infoCheck'
]
=
\
(
self
.
widget
.
infoCheck
.
isChecked
(),
'checked'
)
self
.
options
[
'headerCheck'
]
=
\
(
self
.
widget
.
headerCheck
.
isChecked
(),
'checked'
)
self
.
options
[
'plesCheck'
]
=
\
...
...
@@ -155,6 +161,9 @@ class Plugin(arcclasses.Plugin):
doc
=
QTextDocument
()
cursor
=
QTextCursor
(
doc
)
# For add headers
headers
=
[]
_c
=
_b
=
None
for
message
in
self
.
emails
:
_c
=
cursor
.
charFormat
()
...
...
@@ -204,9 +213,13 @@ class Plugin(arcclasses.Plugin):
if
self
.
igHeader
:
text
=
self
.
stripHeaders
(
message
,
text
)
# print('header ignored')
if
self
.
addHeader
:
cursor
.
insertHtml
(
'<br/><p id=29>'
+
chr
(
29
)
+
'</p>'
)
headers
.
append
(
self
.
getHeader
(
message
))
# print('custom header added')
cursor
.
insertHtml
(
text
)
if
self
.
delim
!=
''
:
cursor
.
insertHtml
(
'<br/><p>'
+
chr
(
2
)
+
'</p><br/><br/>'
)
cursor
.
insertHtml
(
'<br/><p>'
+
chr
(
2
6
)
+
'</p><br/><br/>'
)
# print('message inserted')
cursor
.
insertBlock
()
cursor
.
setCharFormat
(
_c
)
...
...
@@ -224,8 +237,15 @@ class Plugin(arcclasses.Plugin):
doc
.
setHtml
(
self
.
stripDuplicate
(
doc
.
toHtml
()))
print
(
'duplicates stripped'
)
# Delimiter (before ples strip in case we accdntly lose a marker)
doc
.
setHtml
(
doc
.
toHtml
().
replace
(
chr
(
2
),
self
.
delim
))
doc
.
setHtml
(
doc
.
toHtml
().
replace
(
chr
(
26
),
self
.
delim
))
if
self
.
addHeader
:
text
=
doc
.
toHtml
()
for
h
in
headers
:
text
=
text
.
replace
(
chr
(
29
),
h
,
1
)
doc
.
setHtml
(
text
)
if
self
.
igPles
:
# This isn't in a function because it works on the QTextDoc
# doc.setHtml(self.stripPleasantries(doc.toHtml()))
plc
=
QTextCursor
(
doc
)
go
=
True
while
go
:
...
...
@@ -242,6 +262,8 @@ class Plugin(arcclasses.Plugin):
plc
.
removeSelectedText
()
go
=
plc
.
movePosition
(
plc
.
NextBlock
)
print
(
'pleasantries forgone'
)
print
(
doc
.
toHtml
().
count
(
chr
(
29
)))
if
self
.
igSpace
:
doc
.
setHtml
(
self
.
collapseSpace
(
doc
.
toHtml
()))
...
...
@@ -364,6 +386,9 @@ class Plugin(arcclasses.Plugin):
self
.
fetched
=
True
return
0
def
setAddHeader
(
self
,
b
):
self
.
addHeader
=
b
def
setIgHeader
(
self
,
b
):
self
.
igHeader
=
b
...
...
@@ -492,58 +517,93 @@ class Plugin(arcclasses.Plugin):
bodyText
=
re
.
sub
(
'
\xa0
'
,
' '
,
bodyText
)
bodyRef
=
re
.
sub
(
'
\xa0
'
,
' '
,
bodyRef
)
bodyTags
=
re
.
findall
(
r
'\s*<.+?>\s*'
,
bodyText
)
bodyText
=
re
.
sub
(
r
'\s*<.+?>\s*'
,
chr
(
1
),
bodyText
)
bodyText
=
re
.
sub
(
r
'\s*<.+?>\s*'
,
chr
(
24
),
bodyText
)
bodyRef
=
[
re
.
escape
(
p
)
for
p
in
\
re
.
split
(
r
'\s*<.+?>\s*'
,
bodyRef
)
if
p
!=
''
]
bodyRef
=
'(?s)'
+
(
chr
(
1
)
+
'+'
).
join
(
bodyRef
)
bodyRef
=
'(?s)'
+
(
chr
(
24
)
+
'+'
).
join
(
bodyRef
)
bodyRef
=
re
.
sub
(
r
'(\\\s)+'
,
r
'\\s+'
,
bodyRef
)
# Search for the original message within the reply
partial
=
re
.
search
(
bodyRef
,
bodyText
)
if
partial
:
excise
=
partial
.
group
(
0
).
count
(
chr
(
1
))
excise
=
partial
.
group
(
0
).
count
(
chr
(
24
))
offset
=
\
bodyText
[:
bodyText
.
find
(
partial
.
group
(
0
))]
\
.
count
(
chr
(
1
))
.
count
(
chr
(
24
))
bodyText
=
bodyText
.
replace
(
partial
.
group
(
0
),
''
)
bodyTags
=
(
bodyTags
[:
offset
]
+
bodyTags
[
offset
+
excise
:])
# Replace the tags
for
t
in
bodyTags
:
bodyText
=
bodyText
.
replace
(
chr
(
1
),
t
,
1
)
bodyText
=
bodyText
.
replace
(
chr
(
24
),
t
,
1
)
text
=
container
[
0
]
+
bodyText
+
container
[
1
]
# We messed up? Impossible...
# else:
# print(bodyText)
# print('-------vvvvvv------')
# print('-------vvvvvv------
-
')
# print(bodyRef)
return
text
def
getHeader
(
self
,
message
):
text
=
'<br/><span><p style="text-align:center;font-weight:bold;">From '
# print(message.keys())
text
+=
message
.
get
(
'From'
)
text
+=
' on '
text
+=
message
.
get
(
'Date'
)
text
+=
':</p></span><br/>'
return
text
def
stripDuplicate
(
self
,
text
):
num
=
self
.
widget
.
dupStrength
.
value
()
globs
=
{}
body
=
re
.
search
(
'(?s)<body[^>]*?>(.+)</body>'
,
text
).
group
(
1
)
container
=
text
.
split
(
body
)
tags
=
re
.
findall
(
r
'(?s)\s*<.+?>\s*'
,
body
)
print
(
len
(
tags
),
tags
[
0
])
# Replace tags
stripped
=
re
.
sub
(
r
'(?s)\s*<.+?>\s*'
,
chr
(
1
),
body
)
stripped
=
re
.
sub
(
r
'(?s)\s*<.+?>\s*'
,
chr
(
24
),
body
)
# Replace
stripped
=
re
.
sub
(
'
\xa0
'
,
' '
,
stripped
)
stripped
=
re
.
sub
(
r
' '
,
' '
,
stripped
)
f
=
open
(
'wspace.txt'
,
'w'
)
f
.
write
(
stripped
)
f
.
close
()
spaces
=
re
.
findall
(
r
'(?s)\s+'
,
stripped
)
print
(
len
(
spaces
),
spaces
[
0
])
stripped
=
re
.
sub
(
r
'(?s)\s+'
,
r
' '
,
stripped
)
f
=
open
(
'wospace.txt'
,
'w'
)
f
.
write
(
stripped
)
f
.
close
()
print
(
"stripped"
)
_
=
[]
# for word in re.finditer(r'(?:^| )(.+?)(?:(?= )|$)',stripped):
for
word
in
re
.
finditer
(
r
'(?:\b)(.+?)(?:\b)'
,
stripped
):
for
word
in
re
.
finditer
(
r
'(?
s)(?
:\b)(.+?
\s+?
)(?:\b)'
,
stripped
):
# for word in re.finditer(r'(?:\s)(.+?)(?:\s)',stripped):
_
.
append
((
word
.
group
(
1
),
word
.
start
(
1
),
word
.
end
(
1
)))
_
.
append
(
(
word
.
group
(
1
),
word
.
start
(
1
),
word
.
end
(
1
))
)
if
(
chr
(
24
)
in
_
[
-
1
][
0
]
or
re
.
search
(
r
'[-;:/.,=\\!?@#$%^&*()_+<>\[\]|{}]'
,
_
[
-
1
][
0
])):
_
.
pop
(
-
1
)
# print('-> '+_.pop(-1)[0])
continue
if
len
(
_
)
==
num
:
hsh
=
''
.
join
(
x
[
0
].
replace
(
chr
(
1
),
''
)
for
x
in
_
).
__hash__
()
# I think its missing the collision with the dot after the 11
# I tried removing spaces but that doesnt seem to actually
# affect the results. At leasts it's vastly improved from b4
hsh
=
''
.
join
(
x
[
0
].
replace
(
chr
(
24
),
''
)
for
x
in
_
)
hsh
=
hsh
.
replace
(
' '
,
''
)
hsh
=
hsh
.
__hash__
()
if
hsh
not
in
globs
:
globs
[
hsh
]
=
[]
else
:
print
(
'collision'
,
hsh
,
''
.
join
(
x
[
0
].
replace
(
chr
(
24
),
''
)
for
x
in
_
),
len
(
globs
[
hsh
]))
globs
[
hsh
].
append
((
_
[
0
][
1
],
_
[
-
1
][
2
]))
_
.
pop
(
0
)
# print('-> '+_.pop(0)[0])
# print(_)
# input()
print
(
"globbed"
)
chains
=
[]
...
...
@@ -573,19 +633,19 @@ class Plugin(arcclasses.Plugin):
lh
,
lt
=
len
(
head
),
len
(
tail
)
# print(lh, lt, lh-lt)
tagsRemoved
=
chunk
.
count
(
chr
(
1
))
tagsRemoved
=
chunk
.
count
(
chr
(
24
))
spaceRemoved
=
chunk
.
count
(
' '
)
offset
+=
len
(
chunk
)
-
tagsRemoved
spaces
=
(
spaces
[:
head
.
count
(
' '
)]
+
spaces
[
head
.
count
(
' '
)
+
spaceRemoved
:])
stripped
=
head
+
chr
(
1
)
*
tagsRemoved
+
tail
stripped
=
head
+
chr
(
24
)
*
tagsRemoved
+
tail
print
(
"excised"
)
for
s
in
spaces
:
stripped
=
stripped
.
replace
(
' '
,
s
,
1
)
for
t
in
tags
:
stripped
=
stripped
.
replace
(
chr
(
1
),
t
,
1
)
stripped
=
stripped
.
replace
(
chr
(
24
),
t
,
1
)
print
(
"replaced"
)
return
container
[
0
]
+
stripped
+
container
[
1
]
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment