Commit 88f0c3be authored by Cage, Marshall Andrew's avatar Cage, Marshall Andrew
Browse files

Add preliminary nicety removal

parent 13124e03
......@@ -573,7 +573,7 @@ class ARCTool(QMainWindow):
ARCG.PluginSelectDialog.updateModuleList()
def generateReport(self):
self.ui.statusBar.showMessage("Generating Report")
self.ui.statusBar.showMessage("Generating Report...")
self.ui.exportReport.setEnabled(False)
self.document = ARCD.Document()
self.document.setTitle(self.profile.getName())
......@@ -589,14 +589,19 @@ class ARCTool(QMainWindow):
# s.updateContent()
pb.setValue(i)
if s.hasPlugin():
r = self.document.addSection(s)
if r < 0:
self.ui.statusBar.removeWidget(pb)
self.ui.statusBar.showMessage(
"Generation Failed At " + s.getTitle()
)
self.ui.sectionList.setCurrentRow(toc[1].index(s))
return
try:
r = self.document.addSection(s)
except:
r = -1
else:
if r < 0:
self.ui.statusBar.removeWidget(pb)
self.ui.statusBar.showMessage(
"Generation Failed At " + s.getTitle()
)
self.ui.sectionList.setCurrentRow(toc[1].index(s))
return
self.ui.statusBar.showMessage("Generating Report...")
i += 1
self.ui.statusBar.removeWidget(pb)
......
......@@ -35,6 +35,7 @@ class Plugin(arcclasses.Plugin):
self.contextFilters = []
self.igHeader = False
self.igPles = False
self.igReplies= False
self.igDup = False
self.igFirstDup = False
......@@ -49,8 +50,10 @@ class Plugin(arcclasses.Plugin):
#Filter
self.widget.emailFilterTable = EmailFilterTable()
self.widget.emailFilterTable.tableChanged.connect(
lambda x: self.widget.fetchButton.setProperty('enabled',(x > 0 or
self.widget.contextCheck.isChecked()))
lambda x: self.widget.fetchButton.setProperty('enabled',
(x > 0
or self.widget.contextCheck.isChecked()
or self.widget.selectEdit.text() != ''))
)
self.widget.emailFilterTable.tableChanged.connect(
ARCTool.signalProfileChanged
......@@ -72,6 +75,9 @@ class Plugin(arcclasses.Plugin):
self.widget.headerCheck.stateChanged.connect(
lambda x: self.setIgHeader(x > 0)
)
self.widget.plesCheck.stateChanged.connect(
lambda x: self.setIgPles(x > 0)
)
self.widget.quoteCheck.stateChanged.connect(
lambda x: self.setIgReplies(x > 0)
)
......@@ -120,6 +126,8 @@ class Plugin(arcclasses.Plugin):
(self.widget.contextCheck.isChecked(), 'checked')
self.options['headerCheck'] =\
(self.widget.headerCheck.isChecked(), 'checked')
self.options['plesCheck'] =\
(self.widget.plesCheck.isChecked(), 'checked')
self.options['quoteCheck'] =\
(self.widget.quoteCheck.isChecked(), 'checked')
self.options['dupCheck'] =\
......@@ -159,8 +167,9 @@ class Plugin(arcclasses.Plugin):
if typ == 'text/plain':
try:
text = ('<p>'
+ part.get_payload(decode=True)
.decode('utf-8') + '</p>'
+ re.sub(r'(?<=\r)\n',r'<br/>',
part.get_payload(decode=True).decode('utf-8'))
+ '</p>'
)
except UnicodeDecodeError:
print("Couldn't decode this part")
......@@ -172,7 +181,7 @@ class Plugin(arcclasses.Plugin):
except UnicodeDecodeError:
print("Couldn't decode this part")
continue
elif 'image/' in typ and not self.igImages:
elif 'image/' in typ and self.igImages is not True:
# Embed image
# print('embed image')
dis = dis.split(';',1)
......@@ -186,13 +195,19 @@ class Plugin(arcclasses.Plugin):
QUrl('cid:'+part.get('Content-ID')[1:-1]),
img
)
print('message converted')
if self.igReplies:
text = self.stripReplies(message,text)
# if self.igPles:
# text = self.stripPleasantries(text)
print('replies ignored')
if self.igHeader:
text = self.stripHeaders(message,text)
print('header ignored')
cursor.insertHtml(text)
if self.delim != '':
cursor.insertHtml('<br/><p>' + self.delim + '</p><br/>')
cursor.insertHtml('<br/><p>' + chr(2) + '</p><br/>')
print('message inserted')
cursor.insertBlock()
cursor.setCharFormat(_c)
cursor.setBlockFormat(_b)
......@@ -203,10 +218,29 @@ class Plugin(arcclasses.Plugin):
cursor.setBlockFormat(_b)
if self.igImages:
doc.setHtml(self.stripImages(doc.toHtml()))
print('images stripped')
if self.igDup:
doc.setHtml(self.stripDuplicate(doc.toHtml()))
print('duplicates stripped')
doc.setHtml(doc.toHtml().replace(chr(2),self.delim))
if self.igSpace:
doc.setHtml(self.collapseSpace(doc.toHtml()))
print('whitespace stripped')
if self.igPles:
plc = QTextCursor(doc)
print('a')
print(plc.position(), doc.characterCount())
while plc.position() < doc.characterCount():
print('b', plc.position())
plc.movePosition(plc.EndOfBlock,plc.KeepAnchor)
text = plc.selectedText()
words = re.split(r'\b',text)
if len(words) < 4:
print(text)
plc.removeSelectedText()
plc.movePosition(plc.StartOfBlock)
print('pleasantries forgone')
# print("generated")
return doc
......@@ -214,7 +248,6 @@ class Plugin(arcclasses.Plugin):
def makeRequest(self):
# make async? or at least talk to the user
# Logic
req, r, i = self.widget.emailFilterTable.getRequest(
self.widget.logicBox.text()
......@@ -296,7 +329,12 @@ class Plugin(arcclasses.Plugin):
"Fetching Messages..."
)
self.emails = []
for num in data[0].split():
_ = 1
nums = data[0].split()
for num in nums:
ARCTool.getStatusBar().showMessage(
"Fetching Message %d of %d..." %(_,len(nums))
)
typ, data = M.fetch(num, '(RFC822)')
self.emails.append(
email.message_from_bytes(data[0][1])
......@@ -304,6 +342,7 @@ class Plugin(arcclasses.Plugin):
id = self.emails[-1].get('Message-ID')
if id:
self.emailIds[id] = self.emails[-1]
_ +=1
M.close()
except protocol.error as e:
ARCTool.getStatusBar().showMessage(
......@@ -323,6 +362,9 @@ class Plugin(arcclasses.Plugin):
def setIgHeader(self, b):
    """Store the "ignore email headers" option.

    Args:
        b: Truthy to have header fields stripped from each message when
            the document is built; stored on ``self.igHeader`` as given.
    """
    self.igHeader = b
def setIgPles(self, b):
    """Store the "ignore pleasantries" option.

    Args:
        b: Truthy to enable the pleasantry-removal pass when the document
            is built; stored on ``self.igPles`` as given.
    """
    self.igPles = b
def setIgReplies(self, b):
    """Store the "ignore quoted replies" option.

    Args:
        b: Truthy to have quoted replies stripped from each message when
            the document is built; stored on ``self.igReplies`` as given.
    """
    self.igReplies = b
......@@ -349,12 +391,61 @@ class Plugin(arcclasses.Plugin):
%(self.headers),'',text)
return text
def stripPleasantries(self, text):
    """Placeholder for text-based pleasantry removal; currently a no-op.

    Args:
        text: Text to filter (the disabled call site passes one message's
            HTML).

    Returns:
        ``text`` exactly as received.
    """
    return text
    # ------------------------------------------------------------------
    # Disabled experimental drafts, kept for reference only; nothing
    # below this point executes. First draft: drop <br>-separated blocks
    # that contain fewer than four word-boundary pieces. Second draft:
    # trim leading/trailing blocks based on block-length deltas.
    # ------------------------------------------------------------------
    # deltas = []
    # blocks = re.split('<br/?>',text)
    # if len(blocks) == 1:
    # return text
    # print('enough blocks')
    # for b in blocks[:]:
    # _b = re.sub(r'(?s)\s*<.+?>\s*', '',b)
    # _b = re.sub('\xa0',' ',_b)
    # _b = re.sub(r'&nbsp;',' ',_b)
    # if len(re.split(r'\b',_b)) < 4:
    # blocks.remove(b)
    # print(blocks)
    # return ''.join(blocks)
    # breaks = re.findall('<br/?>',text)
    # blockIndex = [
    # len(blocks[x]) + len(breaks[x]) for x in range(len(breaks))
    # ] + [len(blocks[-1])]
    # print('block indicies', blockIndex)
    # for i in range(1,len(blocks)):
    # blockIndex[i] += blockIndex[i-1]
    # for b in blocks:
    # b = re.sub(r'(?s)\s*<.+?>\s*', '',b)
    # wc = len(blocks[0])
    # for i in range(1,len(blocks)):
    # deltas.append( abs(len(blocks[i]) - len(blocks[i-1])) )
    # wc = len(blocks[i])
    # avg = wc/len(blocks)
    # deltaN = sum(deltas) / (2*wc)
    # blockLens = [len(b) for b in blocks]
    # if deltaN > avg/max(blockLens):
    # asc = 0
    # while deltas[asc] < avg:
    # asc += 1
    # des = len(deltas) - 1
    # # Naive approach, should really check to see if other islands exist
    # while des > asc and deltas[des] < avg:
    # des -= 1
    # print('start/stop block index', asc, des)
    # text = text[blockIndex[asc]:blockIndex[des]]
    # return text
def stripImages(self, text):
    """Remove every ``<img>`` element from an HTML string.

    The match is confined to the tag itself: attributes are consumed up
    to the tag's own closing ``>`` (quoted attribute values may contain
    ``>``), plus an optional immediately following ``</img>``. The
    previous lazy ``.+?`` pattern ran on to the next ``/>`` or ``</img>``
    anywhere later in the document when an image tag was not
    self-closed, deleting unrelated content in between.

    Args:
        text: HTML markup to filter.

    Returns:
        ``text`` with all ``<img ...>``, ``<img ... />`` and
        ``<img ...></img>`` elements removed.
    """
    # Alternatives inside the group start on disjoint characters, so the
    # pattern cannot backtrack pathologically on large documents.
    img_tag = r'''<img\b(?:[^>"']|"[^"]*"|'[^']*')*>(?:\s*</img>)?'''
    return re.sub(img_tag, '', text)
def collapseSpace(self,text):
# print("hello")
# Collapse Spaces
text = re.sub('\xa0',' ',text)
text = re.sub(' +',' ',text)
......@@ -429,14 +520,18 @@ class Plugin(arcclasses.Plugin):
body = re.search('(?s)<body[^>]*?>(.+)</body>',text).group(1)
container = text.split(body)
tags = re.findall(r'(?s)\s*<.+?>\s*',body)
# Replace tags
stripped = re.sub(r'(?s)\s*<.+?>\s*',chr(1),body)
# Replace
stripped = re.sub('\xa0',' ',stripped)
stripped = re.sub(r'&nbsp;',' ',stripped)
spaces = re.findall(r'\s+',stripped)
stripped = re.sub(r'\s+',r' ',stripped)
# print("stripped")
spaces = re.findall(r'(?s)\s+',stripped)
stripped = re.sub(r'(?s)\s+',r' ',stripped)
print("stripped")
_ = []
for word in re.finditer(r'(?:^| )(.+?)(?:(?= )|$)',stripped):
# for word in re.finditer(r'(?:^| )(.+?)(?:(?= )|$)',stripped):
for word in re.finditer(r'(?:\b)(.+?)(?:\b)',stripped):
# for word in re.finditer(r'(?:\s)(.+?)(?:\s)',stripped):
_.append((word.group(1),word.start(1),word.end(1)))
if len(_) == num:
hsh = ''.join(x[0].replace(chr(1),'') for x in _).__hash__()
......@@ -445,45 +540,47 @@ class Plugin(arcclasses.Plugin):
globs[hsh].append((_[0][1],_[-1][2]))
_.pop(0)
# print("globbed")
print("globbed")
chains = []
ranges = []
for d in globs:
if len(globs[d]) > 1:
ranges += [x for x in globs[d][1 if self.igFirstDup else 0:]]
ranges += globs[d][1 if self.igFirstDup else 0:]
ranges.sort()
for r in ranges:
if len(chains) == 0:
chains.append([r[0],r[1]])
continue
if r[0] <= chains[-1][1]:
if r[0] <= chains[-1][1] and r[1] > chains[-1][1]:
chains[-1][1] = r[1]
else:
elif r[0] > chains[-1][1]:
chains.append([r[0],r[1]])
# print("chained")
# This is complicated... Only remove the text but keep the tags
print("chained")
# Only remove the text but keep the tags
offset = 0
# print('#chains=',len(chains))
for c in chains:
# Could partition, but I want variable names
chunk = stripped[c[0]-offset:c[1]-offset]
head = stripped[:c[0]-offset]
tail = stripped[c[1]-offset:]
# print(chunk)
# print(len(head), len(tail))
lh, lt = len(head), len(tail)
# print(lh, lt, lh-lt)
tagsRemoved = chunk.count(chr(1))
spaceRemoved = chunk.count(' ')
offset += len(chunk) - tagsRemoved
spaces = (spaces[head.count(' '):]
+ spaces[head.count(' ')
+ spaceRemoved:])
spaces = (spaces[:head.count(' ')]
+ spaces[head.count(' ') + spaceRemoved:])
stripped = head + chr(1)*tagsRemoved + tail
# print("excised")
print("excised")
for s in spaces:
stripped = stripped.replace(' ',s,1)
for t in tags:
stripped = stripped.replace(chr(1),t,1)
# print("replaced")
print("replaced")
return container[0] + stripped + container[1]
\ No newline at end of file
......@@ -142,63 +142,51 @@ Valid operators are: '&amp;', '&amp;&amp;', '|', '||', 'and', 'or'. Use 'not' or
<property name="bottomMargin">
<number>6</number>
</property>
<item row="7" column="1">
<widget class="QCheckBox" name="formatCheck">
<property name="toolTip">
<string>Setting this to &lt;b&gt;true&lt;/b&gt; will paste emails with all formatting cleared.</string>
</property>
<item row="0" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string/>
<string>Honor Context</string>
</property>
</widget>
</item>
<item row="2" column="1">
<widget class="QCheckBox" name="quoteCheck">
<item row="0" column="1">
<widget class="QCheckBox" name="contextCheck">
<property name="toolTip">
<string>Setting this to &lt;b&gt;true&lt;/b&gt; will attempt to remove redundant quotes in email replies.</string>
<string>Setting this to &lt;b&gt;true&lt;/b&gt; automatically adds 'SINCE' and 'BEFORE' filters when appropriate, according to the context.</string>
</property>
<property name="text">
<string/>
<property name="checked">
<bool>false</bool>
</property>
</widget>
</item>
<item row="8" column="0">
<widget class="QLabel" name="label_6">
<item row="1" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>Message Delimiter</string>
<string>Add Headers</string>
</property>
</widget>
</item>
<item row="8" column="1">
<widget class="QLineEdit" name="delimeterEdit">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Fixed">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="toolTip">
<string>Delimiter to appear between emails. HTML will appear as HTML in the generated document.</string>
</property>
<property name="placeholderText">
<string>No Delimiter</string>
</property>
<property name="clearButtonEnabled">
<bool>true</bool>
</property>
</widget>
<item row="1" column="1">
<widget class="QCheckBox" name="infoCheck">
<property name="toolTip">
<string>Setting this to &lt;b&gt;true&lt;/b&gt; will include the header info of the email.</string>
</property>
</widget>
</item>
<item row="6" column="1">
<widget class="QCheckBox" name="imageCheck">
<property name="toolTip">
<string>Setting this to &lt;b&gt;true&lt;/b&gt; will ignore images in emails.</string>
</property>
<item row="2" column="0">
<widget class="QLabel" name="label_34">
<property name="text">
<string/>
<string>Ignore Email Headers</string>
</property>
</widget>
</item>
<item row="1" column="1">
<item row="2" column="1">
<widget class="QCheckBox" name="headerCheck">
<property name="toolTip">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Setting this to &lt;span style=&quot; font-weight:600;&quot;&gt;true&lt;/span&gt; will attempt to remove email header fields such as:&lt;/p&gt;&lt;p&gt;&lt;span style=&quot; font-weight:600;&quot;&gt;From:&lt;/span&gt; John Doe&lt;/p&gt;&lt;p&gt;&lt;span style=&quot; font-weight:600;&quot;&gt;To:&lt;/span&gt; Jane Doe&lt;/p&gt;&lt;p&gt;&lt;span style=&quot; font-weight:600;&quot;&gt;Subject:&lt;/span&gt; Re: Pizza Dough&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
......@@ -208,100 +196,63 @@ Valid operators are: '&amp;', '&amp;&amp;', '|', '||', 'and', 'or'. Use 'not' or
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QLabel" name="label_3">
<property name="enabled">
<bool>true</bool>
</property>
<property name="font">
<font>
<strikeout>false</strikeout>
</font>
</property>
<widget class="QLabel" name="label_33">
<property name="text">
<string>Ignore Duplicate Text</string>
<string>Ignore Pleasantries</string>
</property>
</widget>
</item>
<item row="0" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>Honor Context</string>
<item row="3" column="1">
<widget class="QCheckBox" name="plesCheck">
<property name="toolTip">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Setting this to &lt;span style=&quot; font-weight:600;&quot;&gt;true&lt;/span&gt; will attempt to remove pleasantries such as:&lt;/p&gt;&lt;p&gt;&lt;span style=&quot; font-weight:600;&quot;&gt;Hi John Doe&lt;/span&gt;&lt;/p&gt;&lt;p&gt;&lt;span style=&quot; font-weight:600;&quot;&gt;Thanks,&lt;/span&gt;&lt;/p&gt;&lt;p&gt;&lt;span style=&quot; font-weight:600;&quot;&gt;Best regards&lt;/span&gt;&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
</widget>
</item>
<item row="6" column="0">
<widget class="QLabel" name="label_4">
<property name="text">
<string>Ignore Images</string>
<string/>
</property>
</widget>
</item>
<item row="9" column="0">
<spacer name="verticalSpacer_2">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>40</height>
</size>
</property>
</spacer>
</item>
<item row="2" column="0">
<item row="4" column="0">
<widget class="QLabel" name="label_2">
<property name="text">
<string>Ignore Quoted Replies</string>
</property>
</widget>
</item>
<item row="4" column="0">
<widget class="QLabel" name="label_7">
<property name="text">
<string>Collapse Whitespace</string>
</property>
</widget>
</item>
<item row="7" column="0">
<widget class="QLabel" name="label_5">
<property name="text">
<string>Ignore Formatting</string>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label_1">
<property name="text">
<string>Ignore Email Headers</string>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QCheckBox" name="contextCheck">
<item row="4" column="1">
<widget class="QCheckBox" name="quoteCheck">
<property name="toolTip">
<string>Setting this to &lt;b&gt;true&lt;/b&gt; automatically adds 'SINCE' and 'BEFORE' filters when appropriate, according to the context.</string>
<string>Setting this to &lt;b&gt;true&lt;/b&gt; will attempt to remove redundant quotes in email replies.</string>
</property>
<property name="text">
<string/>
</property>
<property name="checked">
<bool>false</bool>
</property>
</widget>
</item>
<item row="4" column="1">
<widget class="QCheckBox" name="spaceCheck">
<property name="toolTip">
<string>Setting this to &lt;b&gt;true&lt;/b&gt; will remove consecutive whitespace.</string>
<item row="5" column="0">
<widget class="QLabel" name="label_3">
<property name="enabled">
<bool>true</bool>
</property>
<property name="font">
<font>
<strikeout>false</strikeout>
</font>
</property>
<property name="text">
<string/>
<string>Ignore Duplicate Text</string>
</property>
</widget>
</item>
<item row="3" column="1">
<item row="5" column="1">
<widget class="QWidget" name="horizontalWidget" native="true">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Minimum">
......@@ -344,7 +295,7 @@ Valid operators are: '&amp;', '&amp;&amp;', '|', '||', 'and', 'or'. Use 'not' or
<string>Setting this to &lt;b&gt;true&lt;/b&gt; will ignore the first passage of duplicate text found. This is similar to ignoring quoted replies.</string>
</property>
<property name="text">
<string>Ignore First</string>
<string>Keep First</string>
</property>
</widget>
</item>
......@@ -416,6 +367,103 @@ Valid operators are: '&amp;', '&amp;&amp;', '|', '||', 'and', 'or'. Use 'not' or
</layout>
</widget>
</item>
<item row="6" column="0">
<widget class="QLabel" name="label_7">
<property name="text">
<string>Collapse Whitespace</string>
</property>
</widget>
</item>
<item row="6" column="1">
<widget class="QCheckBox" name="spaceCheck">
<property name="toolTip">
<string>Setting this to &lt;b&gt;true&lt;/b&gt; will remove consecutive whitespace.</string>
</property>
</widget>
</item>
<item row="7" column="0">
<widget class="QLabel" name="label_4">
<property name="text">
<string>Ignore Images</string>
</property>
</widget>
</item>
<item row="7" column="1">
<widget class="QCheckBox" name="imageCheck">
<property name="toolTip">
<string>Setting this to &lt;b&gt;true&lt;/b&gt; will ignore images in emails.</string>
</property>
<property name="text">
<string/>
</property>
</widget>
</item>
<item row="8" column="0">
<widget class="QLabel" name="label_5">
<property name="text">
<string>Ignore Formatting</string>
</property>
</widget>
</item>
<item row="8" column="1">
<widget class="QCheckBox" name="formatCheck">
<property name="toolTip">
<string>Setting this to &lt;b&gt;true&lt;/b&gt; will paste emails with all formatting cleared.</string>
</property>
<property name="text">
<string/>
</property>
</widget>