Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Cage, Marshall Andrew
arctool
Commits
4ebf9c71
Commit
4ebf9c71
authored
Aug 09, 2016
by
Cage, Marshall Andrew
Browse files
Improve duplicate removal in email dump
parent
0d0084a8
Changes
1
Hide whitespace changes
Inline
Side-by-side
packages/Default/emaildump.py
View file @
4ebf9c71
...
@@ -35,6 +35,7 @@ class Plugin(arcclasses.Plugin):
...
@@ -35,6 +35,7 @@ class Plugin(arcclasses.Plugin):
self
.
contextFilters
=
[]
self
.
contextFilters
=
[]
self
.
igHeader
=
False
self
.
igHeader
=
False
self
.
addHeader
=
False
self
.
igPles
=
False
self
.
igPles
=
False
self
.
igReplies
=
False
self
.
igReplies
=
False
self
.
igDup
=
False
self
.
igDup
=
False
...
@@ -72,6 +73,9 @@ class Plugin(arcclasses.Plugin):
...
@@ -72,6 +73,9 @@ class Plugin(arcclasses.Plugin):
self
.
widget
.
contextCheck
.
stateChanged
.
connect
(
self
.
widget
.
contextCheck
.
stateChanged
.
connect
(
lambda
x
:
self
.
widget
.
fetchButton
.
setEnabled
(
x
>
0
)
lambda
x
:
self
.
widget
.
fetchButton
.
setEnabled
(
x
>
0
)
)
)
self
.
widget
.
infoCheck
.
stateChanged
.
connect
(
lambda
x
:
self
.
setAddHeader
(
x
>
0
)
)
self
.
widget
.
headerCheck
.
stateChanged
.
connect
(
self
.
widget
.
headerCheck
.
stateChanged
.
connect
(
lambda
x
:
self
.
setIgHeader
(
x
>
0
)
lambda
x
:
self
.
setIgHeader
(
x
>
0
)
)
)
...
@@ -124,6 +128,8 @@ class Plugin(arcclasses.Plugin):
...
@@ -124,6 +128,8 @@ class Plugin(arcclasses.Plugin):
(
self
.
widget
.
delimeterEdit
.
text
(),
'text'
)
(
self
.
widget
.
delimeterEdit
.
text
(),
'text'
)
self
.
options
[
'contextCheck'
]
=
\
self
.
options
[
'contextCheck'
]
=
\
(
self
.
widget
.
contextCheck
.
isChecked
(),
'checked'
)
(
self
.
widget
.
contextCheck
.
isChecked
(),
'checked'
)
self
.
options
[
'infoCheck'
]
=
\
(
self
.
widget
.
infoCheck
.
isChecked
(),
'checked'
)
self
.
options
[
'headerCheck'
]
=
\
self
.
options
[
'headerCheck'
]
=
\
(
self
.
widget
.
headerCheck
.
isChecked
(),
'checked'
)
(
self
.
widget
.
headerCheck
.
isChecked
(),
'checked'
)
self
.
options
[
'plesCheck'
]
=
\
self
.
options
[
'plesCheck'
]
=
\
...
@@ -155,6 +161,9 @@ class Plugin(arcclasses.Plugin):
...
@@ -155,6 +161,9 @@ class Plugin(arcclasses.Plugin):
doc
=
QTextDocument
()
doc
=
QTextDocument
()
cursor
=
QTextCursor
(
doc
)
cursor
=
QTextCursor
(
doc
)
# For add headers
headers
=
[]
_c
=
_b
=
None
_c
=
_b
=
None
for
message
in
self
.
emails
:
for
message
in
self
.
emails
:
_c
=
cursor
.
charFormat
()
_c
=
cursor
.
charFormat
()
...
@@ -204,9 +213,13 @@ class Plugin(arcclasses.Plugin):
...
@@ -204,9 +213,13 @@ class Plugin(arcclasses.Plugin):
if
self
.
igHeader
:
if
self
.
igHeader
:
text
=
self
.
stripHeaders
(
message
,
text
)
text
=
self
.
stripHeaders
(
message
,
text
)
# print('header ignored')
# print('header ignored')
if
self
.
addHeader
:
cursor
.
insertHtml
(
'<br/><p id=29>'
+
chr
(
29
)
+
'</p>'
)
headers
.
append
(
self
.
getHeader
(
message
))
# print('custom header added')
cursor
.
insertHtml
(
text
)
cursor
.
insertHtml
(
text
)
if
self
.
delim
!=
''
:
if
self
.
delim
!=
''
:
cursor
.
insertHtml
(
'<br/><p>'
+
chr
(
2
)
+
'</p><br/><br/>'
)
cursor
.
insertHtml
(
'<br/><p>'
+
chr
(
2
6
)
+
'</p><br/><br/>'
)
# print('message inserted')
# print('message inserted')
cursor
.
insertBlock
()
cursor
.
insertBlock
()
cursor
.
setCharFormat
(
_c
)
cursor
.
setCharFormat
(
_c
)
...
@@ -224,8 +237,15 @@ class Plugin(arcclasses.Plugin):
...
@@ -224,8 +237,15 @@ class Plugin(arcclasses.Plugin):
doc
.
setHtml
(
self
.
stripDuplicate
(
doc
.
toHtml
()))
doc
.
setHtml
(
self
.
stripDuplicate
(
doc
.
toHtml
()))
print
(
'duplicates stripped'
)
print
(
'duplicates stripped'
)
# Delimiter (before ples strip in case we accdntly lose a marker)
# Delimiter (before ples strip in case we accdntly lose a marker)
doc
.
setHtml
(
doc
.
toHtml
().
replace
(
chr
(
2
),
self
.
delim
))
doc
.
setHtml
(
doc
.
toHtml
().
replace
(
chr
(
26
),
self
.
delim
))
if
self
.
addHeader
:
text
=
doc
.
toHtml
()
for
h
in
headers
:
text
=
text
.
replace
(
chr
(
29
),
h
,
1
)
doc
.
setHtml
(
text
)
if
self
.
igPles
:
if
self
.
igPles
:
# This isn't in a function because it works on the QTextDoc
# doc.setHtml(self.stripPleasantries(doc.toHtml()))
plc
=
QTextCursor
(
doc
)
plc
=
QTextCursor
(
doc
)
go
=
True
go
=
True
while
go
:
while
go
:
...
@@ -242,6 +262,8 @@ class Plugin(arcclasses.Plugin):
...
@@ -242,6 +262,8 @@ class Plugin(arcclasses.Plugin):
plc
.
removeSelectedText
()
plc
.
removeSelectedText
()
go
=
plc
.
movePosition
(
plc
.
NextBlock
)
go
=
plc
.
movePosition
(
plc
.
NextBlock
)
print
(
'pleasantries forgone'
)
print
(
'pleasantries forgone'
)
print
(
doc
.
toHtml
().
count
(
chr
(
29
)))
if
self
.
igSpace
:
if
self
.
igSpace
:
doc
.
setHtml
(
self
.
collapseSpace
(
doc
.
toHtml
()))
doc
.
setHtml
(
self
.
collapseSpace
(
doc
.
toHtml
()))
...
@@ -364,6 +386,9 @@ class Plugin(arcclasses.Plugin):
...
@@ -364,6 +386,9 @@ class Plugin(arcclasses.Plugin):
self
.
fetched
=
True
self
.
fetched
=
True
return
0
return
0
def
setAddHeader
(
self
,
b
):
self
.
addHeader
=
b
def
setIgHeader
(
self
,
b
):
def
setIgHeader
(
self
,
b
):
self
.
igHeader
=
b
self
.
igHeader
=
b
...
@@ -492,58 +517,93 @@ class Plugin(arcclasses.Plugin):
...
@@ -492,58 +517,93 @@ class Plugin(arcclasses.Plugin):
bodyText
=
re
.
sub
(
'
\xa0
'
,
' '
,
bodyText
)
bodyText
=
re
.
sub
(
'
\xa0
'
,
' '
,
bodyText
)
bodyRef
=
re
.
sub
(
'
\xa0
'
,
' '
,
bodyRef
)
bodyRef
=
re
.
sub
(
'
\xa0
'
,
' '
,
bodyRef
)
bodyTags
=
re
.
findall
(
r
'\s*<.+?>\s*'
,
bodyText
)
bodyTags
=
re
.
findall
(
r
'\s*<.+?>\s*'
,
bodyText
)
bodyText
=
re
.
sub
(
r
'\s*<.+?>\s*'
,
chr
(
1
),
bodyText
)
bodyText
=
re
.
sub
(
r
'\s*<.+?>\s*'
,
chr
(
24
),
bodyText
)
bodyRef
=
[
re
.
escape
(
p
)
for
p
in
\
bodyRef
=
[
re
.
escape
(
p
)
for
p
in
\
re
.
split
(
r
'\s*<.+?>\s*'
,
bodyRef
)
if
p
!=
''
]
re
.
split
(
r
'\s*<.+?>\s*'
,
bodyRef
)
if
p
!=
''
]
bodyRef
=
'(?s)'
+
(
chr
(
1
)
+
'+'
).
join
(
bodyRef
)
bodyRef
=
'(?s)'
+
(
chr
(
24
)
+
'+'
).
join
(
bodyRef
)
bodyRef
=
re
.
sub
(
r
'(\\\s)+'
,
r
'\\s+'
,
bodyRef
)
bodyRef
=
re
.
sub
(
r
'(\\\s)+'
,
r
'\\s+'
,
bodyRef
)
# Search for the original message within the reply
# Search for the original message within the reply
partial
=
re
.
search
(
bodyRef
,
bodyText
)
partial
=
re
.
search
(
bodyRef
,
bodyText
)
if
partial
:
if
partial
:
excise
=
partial
.
group
(
0
).
count
(
chr
(
1
))
excise
=
partial
.
group
(
0
).
count
(
chr
(
24
))
offset
=
\
offset
=
\
bodyText
[:
bodyText
.
find
(
partial
.
group
(
0
))]
\
bodyText
[:
bodyText
.
find
(
partial
.
group
(
0
))]
\
.
count
(
chr
(
1
))
.
count
(
chr
(
24
))
bodyText
=
bodyText
.
replace
(
partial
.
group
(
0
),
''
)
bodyText
=
bodyText
.
replace
(
partial
.
group
(
0
),
''
)
bodyTags
=
(
bodyTags
[:
offset
]
bodyTags
=
(
bodyTags
[:
offset
]
+
bodyTags
[
offset
+
excise
:])
+
bodyTags
[
offset
+
excise
:])
# Replace the tags
# Replace the tags
for
t
in
bodyTags
:
for
t
in
bodyTags
:
bodyText
=
bodyText
.
replace
(
chr
(
1
),
t
,
1
)
bodyText
=
bodyText
.
replace
(
chr
(
24
),
t
,
1
)
text
=
container
[
0
]
+
bodyText
+
container
[
1
]
text
=
container
[
0
]
+
bodyText
+
container
[
1
]
# We messed up? Impossible...
# We messed up? Impossible...
# else:
# else:
# print(bodyText)
# print(bodyText)
# print('-------vvvvvv------')
# print('-------vvvvvv------
-
')
# print(bodyRef)
# print(bodyRef)
return
text
return
text
def
getHeader
(
self
,
message
):
text
=
'<br/><span><p style="text-align:center;font-weight:bold;">From '
# print(message.keys())
text
+=
message
.
get
(
'From'
)
text
+=
' on '
text
+=
message
.
get
(
'Date'
)
text
+=
':</p></span><br/>'
return
text
def
stripDuplicate
(
self
,
text
):
def
stripDuplicate
(
self
,
text
):
num
=
self
.
widget
.
dupStrength
.
value
()
num
=
self
.
widget
.
dupStrength
.
value
()
globs
=
{}
globs
=
{}
body
=
re
.
search
(
'(?s)<body[^>]*?>(.+)</body>'
,
text
).
group
(
1
)
body
=
re
.
search
(
'(?s)<body[^>]*?>(.+)</body>'
,
text
).
group
(
1
)
container
=
text
.
split
(
body
)
container
=
text
.
split
(
body
)
tags
=
re
.
findall
(
r
'(?s)\s*<.+?>\s*'
,
body
)
tags
=
re
.
findall
(
r
'(?s)\s*<.+?>\s*'
,
body
)
print
(
len
(
tags
),
tags
[
0
])
# Replace tags
# Replace tags
stripped
=
re
.
sub
(
r
'(?s)\s*<.+?>\s*'
,
chr
(
1
),
body
)
stripped
=
re
.
sub
(
r
'(?s)\s*<.+?>\s*'
,
chr
(
24
),
body
)
# Replace
# Replace
stripped
=
re
.
sub
(
'
\xa0
'
,
' '
,
stripped
)
stripped
=
re
.
sub
(
'
\xa0
'
,
' '
,
stripped
)
stripped
=
re
.
sub
(
r
' '
,
' '
,
stripped
)
stripped
=
re
.
sub
(
r
' '
,
' '
,
stripped
)
f
=
open
(
'wspace.txt'
,
'w'
)
f
.
write
(
stripped
)
f
.
close
()
spaces
=
re
.
findall
(
r
'(?s)\s+'
,
stripped
)
spaces
=
re
.
findall
(
r
'(?s)\s+'
,
stripped
)
print
(
len
(
spaces
),
spaces
[
0
])
stripped
=
re
.
sub
(
r
'(?s)\s+'
,
r
' '
,
stripped
)
stripped
=
re
.
sub
(
r
'(?s)\s+'
,
r
' '
,
stripped
)
f
=
open
(
'wospace.txt'
,
'w'
)
f
.
write
(
stripped
)
f
.
close
()
print
(
"stripped"
)
print
(
"stripped"
)
_
=
[]
_
=
[]
# for word in re.finditer(r'(?:^| )(.+?)(?:(?= )|$)',stripped):
# for word in re.finditer(r'(?:^| )(.+?)(?:(?= )|$)',stripped):
for
word
in
re
.
finditer
(
r
'(?:\b)(.+?)(?:\b)'
,
stripped
):
for
word
in
re
.
finditer
(
r
'(?
s)(?
:\b)(.+?
\s+?
)(?:\b)'
,
stripped
):
# for word in re.finditer(r'(?:\s)(.+?)(?:\s)',stripped):
# for word in re.finditer(r'(?:\s)(.+?)(?:\s)',stripped):
_
.
append
((
word
.
group
(
1
),
word
.
start
(
1
),
word
.
end
(
1
)))
_
.
append
(
(
word
.
group
(
1
),
word
.
start
(
1
),
word
.
end
(
1
))
)
if
(
chr
(
24
)
in
_
[
-
1
][
0
]
or
re
.
search
(
r
'[-;:/.,=\\!?@#$%^&*()_+<>\[\]|{}]'
,
_
[
-
1
][
0
])):
_
.
pop
(
-
1
)
# print('-> '+_.pop(-1)[0])
continue
if
len
(
_
)
==
num
:
if
len
(
_
)
==
num
:
hsh
=
''
.
join
(
x
[
0
].
replace
(
chr
(
1
),
''
)
for
x
in
_
).
__hash__
()
# I think its missing the collision with the dot after the 11
# I tried removing spaces but that doesnt seem to actually
# affect the results. At leasts it's vastly improved from b4
hsh
=
''
.
join
(
x
[
0
].
replace
(
chr
(
24
),
''
)
for
x
in
_
)
hsh
=
hsh
.
replace
(
' '
,
''
)
hsh
=
hsh
.
__hash__
()
if
hsh
not
in
globs
:
if
hsh
not
in
globs
:
globs
[
hsh
]
=
[]
globs
[
hsh
]
=
[]
else
:
print
(
'collision'
,
hsh
,
''
.
join
(
x
[
0
].
replace
(
chr
(
24
),
''
)
for
x
in
_
),
len
(
globs
[
hsh
]))
globs
[
hsh
].
append
((
_
[
0
][
1
],
_
[
-
1
][
2
]))
globs
[
hsh
].
append
((
_
[
0
][
1
],
_
[
-
1
][
2
]))
_
.
pop
(
0
)
_
.
pop
(
0
)
# print('-> '+_.pop(0)[0])
# print(_)
# input()
print
(
"globbed"
)
print
(
"globbed"
)
chains
=
[]
chains
=
[]
...
@@ -573,19 +633,19 @@ class Plugin(arcclasses.Plugin):
...
@@ -573,19 +633,19 @@ class Plugin(arcclasses.Plugin):
lh
,
lt
=
len
(
head
),
len
(
tail
)
lh
,
lt
=
len
(
head
),
len
(
tail
)
# print(lh, lt, lh-lt)
# print(lh, lt, lh-lt)
tagsRemoved
=
chunk
.
count
(
chr
(
1
))
tagsRemoved
=
chunk
.
count
(
chr
(
24
))
spaceRemoved
=
chunk
.
count
(
' '
)
spaceRemoved
=
chunk
.
count
(
' '
)
offset
+=
len
(
chunk
)
-
tagsRemoved
offset
+=
len
(
chunk
)
-
tagsRemoved
spaces
=
(
spaces
[:
head
.
count
(
' '
)]
spaces
=
(
spaces
[:
head
.
count
(
' '
)]
+
spaces
[
head
.
count
(
' '
)
+
spaceRemoved
:])
+
spaces
[
head
.
count
(
' '
)
+
spaceRemoved
:])
stripped
=
head
+
chr
(
1
)
*
tagsRemoved
+
tail
stripped
=
head
+
chr
(
24
)
*
tagsRemoved
+
tail
print
(
"excised"
)
print
(
"excised"
)
for
s
in
spaces
:
for
s
in
spaces
:
stripped
=
stripped
.
replace
(
' '
,
s
,
1
)
stripped
=
stripped
.
replace
(
' '
,
s
,
1
)
for
t
in
tags
:
for
t
in
tags
:
stripped
=
stripped
.
replace
(
chr
(
1
),
t
,
1
)
stripped
=
stripped
.
replace
(
chr
(
24
),
t
,
1
)
print
(
"replaced"
)
print
(
"replaced"
)
return
container
[
0
]
+
stripped
+
container
[
1
]
return
container
[
0
]
+
stripped
+
container
[
1
]
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment