Commit c6587da2 authored by Cage, Marshall Andrew
Browse files

Alter glob mechanics

For now, glob items with line classes are sort of unintelligent.
Fixing this will probably keep until I add glob item serialisation.

- Add more comments to the email scrape plugin
- Fix a couple of bugs in email scrape plugin

There was a semantic bug when emails were converted to
QTextDocuments that left out portions of the message.
parent 5542e635
......@@ -6,6 +6,7 @@ import os
import tarfile
import tempfile
import importlib
import traceback
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
......@@ -595,8 +596,9 @@ class ARCTool(QMainWindow):
try:
r = self.document.addSection(s)
except BaseException as e:
# e = sys.exc_info()[2]
print('nope', e)
trace = sys.exc_info()[2]
print(*traceback.format_tb(trace))
r = -1
# else:
if r < 0:
......
......@@ -236,7 +236,7 @@ class Plugin(arcclasses.Plugin):
if self.igDup:
doc.setHtml(self.stripDuplicate(doc.toHtml()))
# print('duplicates stripped')
# Delimiter (before ples strip in case we accdntly lose a marker)
# Delimiter (before ples strip in case we accidentally lose a marker)
doc.setHtml(doc.toHtml().replace(chr(26),self.delim))
if self.addHeader:
text = doc.toHtml()
......@@ -447,7 +447,7 @@ class Plugin(arcclasses.Plugin):
# blockIndex = [
# len(blocks[x]) + len(breaks[x]) for x in range(len(breaks))
# ] + [len(blocks[-1])]
# print('block indicies', blockIndex)
# print('block indices', blockIndex)
# for i in range(1,len(blocks)):
# blockIndex[i] += blockIndex[i-1]
......@@ -495,7 +495,7 @@ class Plugin(arcclasses.Plugin):
return text
'''Loop through assosciated message IDs and remove their text from this
'''Loop through associated message IDs and remove their text from this
message. This might be able to be done earlier by just filtering out the
latest id when inserting the message, but determining which message id
is the right one in a reply is tough because no one follows the standard.
......@@ -558,7 +558,8 @@ class Plugin(arcclasses.Plugin):
return text
def getHeader(self, message):
text = '<br/><span><p style="text-align:center;font-weight:bold;">From '
text = ('<br/><span><p style="text-align:center;font-weight:bold;"'
'>From ')
# print(message.keys())
text += message.get('From')
text += ' on '
......@@ -612,7 +613,8 @@ class Plugin(arcclasses.Plugin):
globs[hsh] = []
# else:
# print('collision',hsh,
# ''.join(x[0].replace(chr(24),'') for x in _).replace(' ', ''),
# ''.join(x[0].replace(chr(24),'') for x in _)
#.replace(' ', ''),
# len(globs[hsh])+1)
globs[hsh].append((_[0][1],_[-1][2]))
_.pop(0)
......
......@@ -151,9 +151,8 @@ class Plugin(arcclasses.Plugin):
#override
def generate(self):
if not self.fetched:
r = self.makeRequest()
if r < 0:
return None
# r = self.makeRequest()
return None
doc = QTextDocument()
cursor = QTextCursor(doc)
......@@ -445,7 +444,9 @@ class Plugin(arcclasses.Plugin):
brush = QBrush(QColor('#00e34b'))
highlight.setBackground(brush)
for s in set:
print('e')
cursor.setPosition(s[0])
print('f')
cursor.setPosition(s[1],cursor.KeepAnchor)
cursor.mergeCharFormat(highlight)
cursor.endEditBlock()
......@@ -456,31 +457,47 @@ class Plugin(arcclasses.Plugin):
item = GlobItem()
for message in self.groups[g]:
doc = QTextDocument()
print('message in', g)
body = ''
for part in message.walk():
typ = part.get_content_type()
dis = part.get('Content-Disposition')
if dis != 'attachment':
if typ == 'text/plain':
body = ''
body += '<p>'
ptbody = ''
try:
body = part.get_payload(decode=True).decode('utf-8')
ptbody = (part.get_payload(decode=True)
.decode('utf-8'))
except UnicodeDecodeError:
body = quopri.decodestring(part.get_payload()).decode('utf-8')
doc.setPlainText(body)
ptbody = (quopri.decodestring(
part.get_payload()).decode('utf-8'))
ptbody = re.sub('\r?\n','<br/>',ptbody)
body += ptbody + '</p>'
# doc.setPlainText(body)
elif typ == 'text/html':
body = ''
try:
body = part.get_payload(decode=True).decode('utf-8')
body += (part.get_payload(decode=True)
.decode('utf-8'))
except UnicodeDecodeError:
body = quopri.decodestring(part.get_payload()).decode('utf-8')
doc.setHtml(body)
body += (quopri.decodestring(part.get_payload())
.decode('utf-8'))
doc.setHtml(body)
curs = []
for p in self.groups[g][message]:
curs.append(QTextCursor(doc))
print(p,doc.characterCount())
print('g')
curs[-1].setPosition(p[0])
print('h')
curs[-1].setPosition(p[1],curs[-1].KeepAnchor)
if curs[-1].selectedText() == '':
print('empty selection; popped')
curs.pop()
if len(curs) > 0:
print('adding globs')
item + Glob(*curs)
self.items[g] = item
......@@ -508,7 +525,9 @@ class Plugin(arcclasses.Plugin):
parts = [p for p in message.walk()]
i=0
for part in parts:
typ, dis = part.get_content_type(), part.get('Content-Disposition')
typ = part.get_content_type()
dis = part.get('Content-Disposition')
if part.is_multipart():
for sp in reversed([s for s in part.walk()]):
if sp not in parts:
......@@ -520,12 +539,14 @@ class Plugin(arcclasses.Plugin):
try:
body = part.get_payload(decode=True).decode('utf-8')
except UnicodeDecodeError:
body = quopri.decodestring(part.get_payload()).decode('utf-8')
body = (quopri.decodestring(part.get_payload())
.decode('utf-8'))
elif typ == 'text/plain' and dis == None and body == '':
try:
body = part.get_payload(decode=True).decode('utf-8')
except UnicodeDecodeError:
body = quopri.decodestring(part.get_payload()).decode('utf-8')
body = (quopri.decodestring(part.get_payload())
.decode('utf-8'))
elif typ[:typ.find('/')] == 'image' and self.igImages:
# Embed image
dis = dis.split(';',1)
......@@ -541,19 +562,21 @@ class Plugin(arcclasses.Plugin):
cur = doc.find('$%s$'%(group),0,
options=doc.FindCaseSensitively
)
# print('find $%s$: %s' %(group, 'FAIL' if cur.isNull() else 'OK'))
# print('find $%s$: %s' %(group, 'FAIL' if cur.isNull()
# else 'OK'))
print('loop', len(self.groups))
text, ok, perc = self.items[group].getText(ref)
# print('find starting at %d: %s %f' %(i,text,perc))
while not cur.isNull():
# Use the GlobItem to get text from the ref document
if not ok:
cur.insertHtml(
('<span style="color:red; font-weight:bold">%s '\
+ '[%0.1f%% certainty]</span>') %(text,perc*100))
('<span style="color:red; font-weight:bold">%s '
'[%0.1f%% certainty]</span>') %(text,perc*100))
elif perc < .6:
cur.insertHtml(
('<span style="color:red">%s [%0.1f%% certainty]<'\
+ '/span>') %(text,perc*100))
('<span style="color:red">%s [%0.1f%% certainty]<'
'/span>') %(text,perc*100))
else:
cur.insertText(text)
......
''' HEY DON'T USE DISCONNECTED GLOBS UNTIL I FIGURE OUT HOW TO ACCOUNT
''' HEY DON'T USE DISCONNECTED GLOBS UNTIL WE FIGURE OUT HOW TO ACCOUNT
FOR THEM IN THE GLOBITEM OKAYTHANKSBYE
If it isn't evident by now, I'm no expert on text classification or NLP,
so most of this is based purely on intuition. The general hope is to
somehow implement a method to match similar text without storing the
actual text or needing to retrieve the text. In this case, this was done
by aggregating and averaging certain features from the selected text.
The resultant numbers are then truncated as necessary and composited
into a 96 bit string/number for serialisation. I mean, you definitely
lose information with the truncate, but when the glob is reconstituted,
it uses word/block divisions to sort of make up for it. So I don't think
it's a terrible implementation. That being said, this is something that
probably requires months of research and someone has probably already
implemented something like this...
Also this doesn't look at word meaning or context, just position and length.
'''
from PyQt5.QtGui import QTextCursor
......@@ -17,8 +32,9 @@ class Glob(object):
self.wc = 1 # Word count
self.lc = 1 # Line count
self.dev = 0.0 # Deviation (stdev/mean)
self.num = '0'*12 # Unique number, independant of text. 96 bits
self.num = '0'*12 # Unique number, independent of text. 96 bits
if len(curs) > 0:
print(curs)
self.setValue(*curs)
def __str__(self):
......@@ -69,7 +85,7 @@ class Glob(object):
# 16 16 10 10 2 2 10 10 10 10
#sta end sta end cla sub cov wc lc dev
# Each of these is 16 bits
# Each variable is 16 bits
a = (self.sta[0] & 65535)
b = (self.end[0] & 65535)
......@@ -139,14 +155,14 @@ class Glob(object):
self.cla = 0 if l < .25 else 1 if l < .66 else 2
self.cov = len(self.val)/doc.characterCount()
self.wc = len(self.val.split())
self.lc = len(spl)
self.wc = len(self.val.split())
self.lc = len(spl)
# split the words
spl = self.val.split()
# store their lengths
les = [len(s) for s in spl]
# get the occurances of each length
# get the occurrences of each length
cou = dict((les.count(l),l) for l in set(les))
# mean of the lengths
......@@ -157,10 +173,10 @@ class Glob(object):
# find the mode of 'les'
mod = cou[max(cou)]
# likelihood any lenth won't be the mode
# likelihood any length won't be the mode
dif = (len(spl) - max(cou)) / len(spl)
# figure out how to incorporate the dif
# figure out how to incorporate the diff
self.dev = dev/mea
return self.calc()
......@@ -205,34 +221,37 @@ class GlobItem():
self.globs.append(g)
# These properties are the same as a single glob, just (mean,stdev)
# Note the temprary type switch
# Note the temporary type switch
self.sta[0] = sum(g.sta[0] for g in self.globs)//len(self.globs)
self.sta[0] = [ self.sta[0],
int(sqrt(sum((g.sta[0]-self.sta[0])**2 for g in \
int(sqrt(sum((g.sta[0]-self.sta[0])**2 for g in
self.globs)/len(self.globs)))]
self.sta[1] = sum(g.sta[1] for g in self.globs)/len(self.globs)
self.sta[1] = [ self.sta[1],
int(sqrt(sum((g.sta[1]-self.sta[1])**2 for g in \
int(sqrt(sum((g.sta[1]-self.sta[1])**2 for g in
self.globs)/len(self.globs)))]
self.end[0] = sum(g.end[0] for g in self.globs)//len(self.globs)
self.end[0] = [ self.end[0],
int(sqrt(sum((g.end[0]-self.end[0])**2 for g in \
int(sqrt(sum((g.end[0]-self.end[0])**2 for g in
self.globs)/len(self.globs)))]
self.end[1] = sum(g.end[1] for g in self.globs)/len(self.globs)
self.end[1] = [ self.end[1],
int(sqrt(sum((g.end[1]-self.end[1])**2 for g in \
int(sqrt(sum((g.end[1]-self.end[1])**2 for g in
self.globs)/len(self.globs)))]
cou = ( sum( 1 for g in self.globs if g.cla == 0),
sum( 1 for g in self.globs if g.cla == 1),
sum( 1 for g in self.globs if g.cla == 2))
self.cla = cou.index(max(cou))
print('class for glob set to',self.sub)
cou = ( sum( 1 for g in self.globs if g.sub == 0),
sum( 1 for g in self.globs if g.sub == 1),
sum( 1 for g in self.globs if g.sub == 2))
self.sub = cou.index(max(cou))
print('sub for glob set to',self.sub)
print([g.cov for g in self.globs])
self.cov = sum(g.cov for g in self.globs)/len(self.globs)
self.cov = (self.cov,sqrt(
sum((g.cov-self.cov)**2 for g in self.globs)/len(self.globs)))
......@@ -252,6 +271,7 @@ class GlobItem():
return len(self.globs)
def getText(self,doc,threshold=.3):
print('gt')
cur = QTextCursor(doc)
cer = 0.0 # certainty
cha = doc.characterCount()
......@@ -262,10 +282,12 @@ class GlobItem():
sta = self.sta[1][0]*cha # get start relative to this doc
if (sta >= self.sta[0][0]-self.sta[0][1] and
sta <= self.sta[0][0]+self.sta[0][1]):
print('aaa')
cur.setPosition(sta) # use relative if within expectations
cer += .1
else:
# use hard start. doesn't boost confidence
print('b')
cur.setPosition(self.sta[0][0])
inb = cur.blockNumber() # cursor's starting block
cur.movePosition(cur.StartOfWord)
......@@ -273,31 +295,36 @@ class GlobItem():
end = self.end[1][0]*cha
if (end >= self.end[0][0]-self.end[0][1] and
end <= self.end[0][0]+self.end[0][1]):
print('c')
cur.setPosition(end,cur.KeepAnchor)
cer += .1
else:
print('e')
cur.setPosition(self.end[0][0],cur.KeepAnchor)
cur.movePosition(cur.EndOfWord,cur.KeepAnchor)
print('flag a')
# Match word count based on line sub classification
if self.cla == 0:
# print("I'm a line")
print("I'm a line")
if self.sub == 0:
# print("I'm a short line")
while cur.blockNumber() > inb:
cur.movePosition(cur.PreviousBlock,cur.KeepAnchor)
cur.movePosition(cur.EndOfBlock,cur.KeepAnchor)
# print('b2',cur.selectedText(),cur.blockNumber(), inb, self.wc)
while len(cur.selectedText().split()) < self.wc[0]-self.wc[1]:
cur.movePosition(cur.NextWord,cur.KeepAnchor)
cur.movePosition(cur.EndOfWord,cur.KeepAnchor)
# print('a')
while len(cur.selectedText().split()) > self.wc[0]+self.wc[1]:
cur.movePosition(cur.PreviousWord,cur.KeepAnchor)
cur.movePosition(cur.EndOfWord,cur.KeepAnchor)
# print('b',cur.selectedText(),cur.blockNumber(), inb, self.wc)
print("I'm a short line")
# while cur.blockNumber() > inb:
# cur.movePosition(cur.PreviousBlock)
# cur.movePosition(cur.EndOfBlock,cur.KeepAnchor)
# print('b2',cur.selectedText(),
# cur.blockNumber(), inb, self.wc)
# while len(cur.selectedText().split()) < self.wc[0]-self.wc[1]:
# cur.movePosition(cur.NextWord)
# cur.movePosition(cur.EndOfWord,cur.KeepAnchor)
print('a',cur.selectedText(),
cur.blockNumber(), inb, self.wc)
# while len(cur.selectedText().split()) < self.wc[0]+self.wc[1]:
# cur.movePosition(cur.PreviousWord)
# cur.movePosition(cur.EndOfWord,cur.KeepAnchor)
# print('b',cur.selectedText(),
# cur.blockNumber(), inb, self.wc)
elif self.sub == 2:
# print("I'm a long line")
print("I'm a long line")
if cur.blockNumber() > inb:
cur.movePosition(cur.PreviousBlock,cur.KeepAnchor)
cur.movePosition(cur.EndOfBlock,cur.KeepAnchor)
......@@ -312,14 +339,18 @@ class GlobItem():
cur.movePosition(cur.EndOfBlock,cur.KeepAnchor)
# print('d')
print('flag b')
text = cur.selectedText() # go ahead and grab the text
# check coverage, word count, line count and deviation, etc. for more confidence
# check coverage, word count, line count and deviation,
# etc. for more confidence
cov = len(text)/cha
wc = len(text.split())
lc = len(text.splitlines())
if cur.selectionStart() <= cha*(1-self.cov[0]):
cer += .1 # boost confidence if there are enough characters for a match, probably
# boost confidence if there are enough characters for a match,
# ... probably
cer += .1
if cur.selectionEnd() <= cha*(1-self.cov[0]):
cer += .1
# print(cha,(sta/cha)-self.sta[1][0],self.sta[1][0])
......@@ -338,7 +369,7 @@ class GlobItem():
# line count
cer += (.08 / max(1,abs(lc-self.lc[0])
/ zerocheck(self.lc[1], self.lc[0] )))
cer += .08 # stdev of word lenth
cer += .08 # stdev of word length
self.findCommon()
for w in text.split(): # common words
if w in self.common:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment