"""Object model over a block-based document-analysis JSON response.

The input is one or more response dictionaries, each with a ``Blocks`` list.
Every block is a dictionary keyed by names such as ``BlockType``, ``Id``,
``Geometry``, ``Relationships`` and ``EntityTypes``.
NOTE(review): this looks like the Amazon Textract response format - confirm
against the caller.

``Document`` is the entry point; it splits the blocks into ``Page`` objects,
which in turn expose lines, form fields and tables.
"""
import json


def _child_ids(block):
    """Return the ids listed under every CHILD relationship of *block*, in order.

    A block with a missing or empty ``Relationships`` entry yields ``[]``.
    """
    ids = []
    for relationship in block.get('Relationships') or []:
        if relationship['Type'] == 'CHILD':
            ids.extend(relationship['Ids'])
    return ids


class BoundingBox:
    """Box described by the width, height, left and top values it was given."""

    def __init__(self, width, height, left, top):
        self._width = width
        self._height = height
        self._left = left
        self._top = top

    def __str__(self):
        return "width: {}, height: {}, left: {}, top: {}".format(
            self._width, self._height, self._left, self._top)

    @property
    def width(self):
        return self._width

    @property
    def height(self):
        return self._height

    @property
    def left(self):
        return self._left

    @property
    def top(self):
        return self._top


class Polygon:
    """A single (x, y) point of a polygon outline."""

    def __init__(self, x, y):
        self._x = x
        self._y = y

    def __str__(self):
        return "x: {}, y: {}".format(self._x, self._y)

    @property
    def x(self):
        return self._x

    @property
    def y(self):
        return self._y


class Geometry:
    """Bounding box plus polygon points parsed from a ``Geometry`` dictionary.

    Args:
        geometry: dict with a ``BoundingBox`` entry (``Width``, ``Height``,
            ``Left``, ``Top``) and a ``Polygon`` entry (iterable of dicts with
            ``X`` and ``Y``).

    Raises:
        KeyError: if any of those keys is missing.
    """

    def __init__(self, geometry):
        box = geometry["BoundingBox"]
        points = geometry["Polygon"]
        self._boundingBox = BoundingBox(
            box["Width"], box["Height"], box["Left"], box["Top"])
        self._polygon = [Polygon(point["X"], point["Y"]) for point in points]

    def __str__(self):
        # Only the bounding box is rendered; polygon points are omitted.
        return "BoundingBox: {}\n".format(str(self._boundingBox))

    @property
    def boundingBox(self):
        return self._boundingBox

    @property
    def polygon(self):
        return self._polygon


class Word:
    """A WORD block: text, confidence, geometry and id.

    Args:
        block: the block dictionary for the word.
        blockMap: id -> block mapping (accepted for a uniform constructor
            signature; not read here).
    """

    def __init__(self, block, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        # A missing or empty Text entry is treated as the empty string.
        self._text = block.get('Text') or ""

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block


class Line:
    """A LINE block together with the WORD blocks it references as children.

    Args:
        block: the block dictionary for the line.
        blockMap: id -> block mapping used to resolve child ids.

    Raises:
        KeyError: if a child id is not present in *blockMap*.
    """

    def __init__(self, block, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        # A missing or empty Text entry is treated as the empty string.
        self._text = block.get('Text') or ""
        self._words = [
            Word(blockMap[cid], blockMap)
            for cid in _child_ids(block)
            if blockMap[cid]["BlockType"] == "WORD"
        ]

    def __str__(self):
        words = "".join("[{}]".format(str(word)) for word in self._words)
        return ("Line\n==========\n" + self._text + "\n"
                + "Words\n----------\n" + words)

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def words(self):
        return self._words

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block


class SelectionElement:
    """A SELECTION_ELEMENT block exposing its ``SelectionStatus`` value.

    Args:
        block: the block dictionary for the selection element.
        blockMap: id -> block mapping (not read here).
    """

    def __init__(self, block, blockMap):
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._selectionStatus = block['SelectionStatus']

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def selectionStatus(self):
        return self._selectionStatus


class FieldKey:
    """Key side of a form field, built from the WORD children of a KEY block.

    Args:
        block: the KEY block dictionary.
        children: ids of the key's child blocks.
        blockMap: id -> block mapping used to resolve *children*.

    ``text`` is the child words joined with single spaces ("" if none).
    """

    def __init__(self, block, children, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._content = []
        for eid in children:
            child = blockMap[eid]
            if child['BlockType'] == "WORD":
                self._content.append(Word(child, blockMap))
        self._text = ' '.join(word.text for word in self._content)

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def content(self):
        return self._content

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block


class FieldValue:
    """Value side of a form field, built from a VALUE block's children.

    Args:
        block: the VALUE block dictionary.
        children: ids of the value's child blocks.
        blockMap: id -> block mapping used to resolve *children*.

    ``content`` holds ``Word`` and ``SelectionElement`` objects in child
    order. ``text`` is the words joined with single spaces; when there are no
    words it is the status of the last selection element seen, or "".
    """

    def __init__(self, block, children, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._text = ""
        self._content = []
        word_texts = []
        for eid in children:
            child = blockMap[eid]
            child_type = child['BlockType']
            if child_type == "WORD":
                word = Word(child, blockMap)
                self._content.append(word)
                word_texts.append(word.text)
            elif child_type == "SELECTION_ELEMENT":
                selection = SelectionElement(child, blockMap)
                self._content.append(selection)
                self._text = selection.selectionStatus
        # Word text, when present, takes precedence over a selection status.
        if word_texts:
            self._text = ' '.join(word_texts)

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def content(self):
        return self._content

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block


class Field:
    """A key/value pair assembled from a KEY block and the VALUE block it links to.

    Args:
        block: the KEY block dictionary.
        blockMap: id -> block mapping used to resolve relationship ids.

    ``key`` is ``None`` when the block has no CHILD relationship (including
    when it has no relationships at all); ``value`` is ``None`` when no linked
    VALUE block has a CHILD relationship.
    """

    def __init__(self, block, blockMap):
        self._key = None
        self._value = None
        # A KEY block without relationships simply produces an empty field,
        # which callers can detect through ``key is None``.
        for item in block.get('Relationships') or []:
            if item["Type"] == "CHILD":
                self._key = FieldKey(block, item['Ids'], blockMap)
            elif item["Type"] == "VALUE":
                for eid in item['Ids']:
                    self._readValue(blockMap[eid], blockMap)

    def _readValue(self, valueBlock, blockMap):
        """Set ``_value`` from *valueBlock* if it is a VALUE entity with children."""
        if 'VALUE' not in valueBlock['EntityTypes']:
            return
        for relationship in valueBlock.get('Relationships') or []:
            if relationship["Type"] == "CHILD":
                self._value = FieldValue(
                    valueBlock, relationship['Ids'], blockMap)

    def __str__(self):
        key_text = str(self._key) if self._key else ""
        value_text = str(self._value) if self._value else ""
        return "\nField\n==========\n" + "Key: {}\nValue: {}".format(
            key_text, value_text)

    @property
    def key(self):
        return self._key

    @property
    def value(self):
        return self._value


class Form:
    """Ordered collection of ``Field`` objects with lookup by key text."""

    def __init__(self):
        self._fields = []
        self._fieldsMap = {}

    def addField(self, field):
        """Append *field*; it must have a key, whose text indexes the field.

        A later field with the same key text replaces the earlier one in the
        lookup map (both remain in ``fields``).
        """
        self._fields.append(field)
        self._fieldsMap[field.key.text] = field

    def __str__(self):
        return "".join(str(field) + "\n" for field in self._fields)

    @property
    def fields(self):
        return self._fields

    def getFieldByKey(self, key):
        """Return the field whose key text equals *key* exactly, or ``None``."""
        return self._fieldsMap.get(key)

    def searchFieldsByKey(self, key):
        """Return all fields whose key text contains *key*, case-insensitively."""
        searchKey = key.lower()
        return [
            field for field in self._fields
            if field.key and searchKey in field.key.text.lower()
        ]


class Cell:
    """A table cell block and the words / selection elements it contains.

    Args:
        block: the cell block dictionary (needs ``RowIndex``, ``ColumnIndex``,
            ``RowSpan`` and ``ColumnSpan`` besides the common keys).
        blockMap: id -> block mapping used to resolve child ids.

    ``text`` concatenates each word followed by ``' '`` and each selection
    status followed by ``', '``, so it keeps a trailing separator.
    """

    def __init__(self, block, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._rowIndex = block['RowIndex']
        self._columnIndex = block['ColumnIndex']
        self._rowSpan = block['RowSpan']
        self._columnSpan = block['ColumnSpan']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._content = []
        pieces = []
        for cid in _child_ids(block):
            child = blockMap[cid]
            child_type = child["BlockType"]
            if child_type == "WORD":
                word = Word(child, blockMap)
                self._content.append(word)
                pieces.append(word.text + ' ')
            elif child_type == "SELECTION_ELEMENT":
                selection = SelectionElement(child, blockMap)
                self._content.append(selection)
                pieces.append(selection.selectionStatus + ', ')
        self._text = "".join(pieces)

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        return self._confidence

    @property
    def rowIndex(self):
        return self._rowIndex

    @property
    def columnIndex(self):
        return self._columnIndex

    @property
    def rowSpan(self):
        return self._rowSpan

    @property
    def columnSpan(self):
        return self._columnSpan

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def content(self):
        return self._content

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block


class Row:
    """A table row: a mutable list of ``Cell`` objects."""

    def __init__(self):
        self._cells = []

    def __str__(self):
        return "".join("[{}]".format(str(cell)) for cell in self._cells)

    @property
    def cells(self):
        return self._cells


class Table:
    """A TABLE block with its child cells grouped into rows.

    Args:
        block: the table block dictionary.
        blockMap: id -> block mapping used to resolve child ids.

    Cells are taken in the order their ids are listed; a new row is started
    whenever a cell's ``rowIndex`` exceeds the current row number (starting
    at 1), so the ids are expected to arrive in row order.
    """

    def __init__(self, block, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._rows = []
        current_index = 1
        row = Row()
        for cid in _child_ids(block):
            cell = Cell(blockMap[cid], blockMap)
            if cell.rowIndex > current_index:
                self._rows.append(row)
                row = Row()
                current_index = cell.rowIndex
            row.cells.append(cell)
        # Flush the last row being filled, unless nothing was ever added.
        if row.cells:
            self._rows.append(row)

    def __str__(self):
        rows = "".join(
            "Row\n==========\n" + str(row) + "\n" for row in self._rows)
        return "Table\n==========\n" + rows

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def rows(self):
        return self._rows

    @property
    def block(self):
        return self._block


class Page:
    """One page of the document: its lines, form fields and tables.

    Args:
        blocks: the block dictionaries that belong to this page.
        blockMap: id -> block mapping for the whole document.

    ``geometry`` and ``id`` come from the page's PAGE block and are ``None``
    when *blocks* contains no such block.
    """

    def __init__(self, blocks, blockMap):
        self._blocks = blocks
        self._text = ""
        self._lines = []
        self._form = Form()
        self._tables = []
        self._content = []
        # Defaults so the properties are safe even without a PAGE block.
        self._geometry = None
        self._id = None
        self._parse(blockMap)

    def __str__(self):
        body = "".join(str(item) + "\n" for item in self._content)
        return "Page\n==========\n" + body

    def _parse(self, blockMap):
        """Walk the page's blocks and populate lines, tables, form and content."""
        for item in self._blocks:
            block_type = item["BlockType"]
            if block_type == "PAGE":
                self._geometry = Geometry(item['Geometry'])
                self._id = item['Id']
            elif block_type == "LINE":
                line = Line(item, blockMap)
                self._lines.append(line)
                self._content.append(line)
                self._text = self._text + line.text + '\n'
            elif block_type == "TABLE":
                table = Table(item, blockMap)
                self._tables.append(table)
                self._content.append(table)
            elif block_type == "KEY_VALUE_SET":
                # VALUE blocks are reached through their KEY block.
                if 'KEY' in item['EntityTypes']:
                    field = Field(item, blockMap)
                    if field.key:
                        self._form.addField(field)
                        self._content.append(field)
                    else:
                        print("WARNING: Detected K/V where key does not have content. Excluding key from output.")
                        print(field)
                        print(item)

    def getLinesInReadingOrder(self):
        """Group the page's lines into columns and return them column by column.

        Columns are discovered greedily: a line joins the first existing
        column whose horizontal span contains the line's centre, or whose own
        centre falls inside the line's span; otherwise it opens a new column
        with the line's own extent.

        Returns:
            list of ``[column_index, text]`` pairs sorted by column index.
            The sort is stable, so lines keep their original order within a
            column.
        """
        columns = []
        lines = []
        for item in self._lines:
            box = item.geometry.boundingBox
            bbox_left = box.left
            bbox_right = box.left + box.width
            bbox_centre = box.left + box.width / 2
            for index, column in enumerate(columns):
                # 'right' is an absolute coordinate, so the centre is the
                # midpoint of the two edges.
                column_centre = (column['left'] + column['right']) / 2
                line_in_column = column['left'] < bbox_centre < column['right']
                column_in_line = bbox_left < column_centre < bbox_right
                if line_in_column or column_in_line:
                    lines.append([index, item.text])
                    break
            else:
                columns.append({'left': bbox_left, 'right': bbox_right})
                lines.append([len(columns) - 1, item.text])
        lines.sort(key=lambda entry: entry[0])
        return lines

    def getTextInReadingOrder(self):
        """Return the line texts from ``getLinesInReadingOrder``, each followed by a newline."""
        return "".join(
            line[1] + '\n' for line in self.getLinesInReadingOrder())

    @property
    def blocks(self):
        return self._blocks

    @property
    def text(self):
        return self._text

    @property
    def lines(self):
        return self._lines

    @property
    def form(self):
        return self._form

    @property
    def tables(self):
        return self._tables

    @property
    def content(self):
        return self._content

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id


class Document:
    """Entry point: parses one or more response dictionaries into pages.

    Args:
        responsePages: a response dict with a ``Blocks`` list, or a list of
            such dicts.
    """

    def __init__(self, responsePages):
        if not isinstance(responsePages, list):
            responsePages = [responsePages]
        self._responsePages = responsePages
        self._pages = []
        self._parse()

    def __str__(self):
        pages = "".join(str(page) + "\n\n" for page in self._pages)
        return "\nDocument\n==========\n" + pages

    def _parseDocumentPagesAndBlockMap(self):
        """Split all blocks into per-page groups and index them by id.

        A PAGE block starts a new group. Blocks seen before the first PAGE
        block are collected into a leading group of their own. Blocks lacking
        ``BlockType`` or ``Id`` are skipped.

        Returns:
            ``(documentPages, blockMap)`` where ``documentPages`` is a list of
            ``{"Blocks": [...]}`` dicts and ``blockMap`` maps id -> block.
        """
        blockMap = {}
        documentPages = []
        currentBlocks = []
        for response in self._responsePages:
            for block in response['Blocks']:
                if 'BlockType' not in block or 'Id' not in block:
                    continue
                blockMap[block['Id']] = block
                if block['BlockType'] == 'PAGE':
                    if currentBlocks:
                        documentPages.append({"Blocks": currentBlocks})
                    currentBlocks = [block]
                else:
                    currentBlocks.append(block)
        if currentBlocks:
            documentPages.append({"Blocks": currentBlocks})
        return documentPages, blockMap

    def _parse(self):
        """Build the block map and one ``Page`` per group of blocks."""
        self._responseDocumentPages, self._blockMap = \
            self._parseDocumentPagesAndBlockMap()
        for documentPage in self._responseDocumentPages:
            self._pages.append(Page(documentPage["Blocks"], self._blockMap))

    @property
    def blocks(self):
        return self._responsePages

    @property
    def pageBlocks(self):
        return self._responseDocumentPages

    @property
    def pages(self):
        return self._pages

    def getBlockById(self, blockId):
        """Return the block dictionary with id *blockId*, or ``None`` if unknown."""
        return self._blockMap.get(blockId)