from __future__ import absolute_import, division, print_function

import numpy as np
import pytesseract
import torch
from PIL import Image, ImageDraw, ImageFont
from torch.nn import CrossEntropyLoss
from transformers import (
    BertConfig,
    BertForTokenClassification,
    BertTokenizer,
    RobertaConfig,
    RobertaForTokenClassification,
    RobertaTokenizer,
)

from layoutlm import LayoutlmConfig, LayoutlmForTokenClassification

# from transformers import LayoutLMTokenizer
# from transformers import LayoutLMForTokenClassification

ALL_MODELS = sum(
    (
        tuple(conf.pretrained_config_archive_map.keys())
        for conf in (BertConfig, RobertaConfig, LayoutlmConfig)
    ),
    (),
)

MODEL_CLASSES = {
    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
    "layoutlm": (LayoutlmConfig, LayoutlmForTokenClassification, BertTokenizer),
}

args = {
    "local_rank": -1,
    "overwrite_cache": True,
    "data_dir": "/datadrive/unilm/layoutlm/deprecated/examples/seq_labeling/invoice_data/",
    "model_name_or_path": "/datadrive/unilm/layoutlm/deprecated/models/layoutLM-base-uncased/",
    "max_seq_length": 512,
    "model_type": "layoutlm",
    "config_name": "/datadrive/unilm/layoutlm/deprecated/models/layoutLM-base-uncased/config.json",
    "cache_dir": False,
    "tokenizer_name": "",
}


# Class that exposes the keys of a dict as attributes (thanks, Stack Overflow).
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def get_labels(path):
    with open(path, "r") as f:
        labels = f.read().splitlines()
    if "O" not in labels:
        labels = ["O"] + labels
    return labels


labels = get_labels(
    "/datadrive/unilm/layoutlm/deprecated/examples/seq_labeling/invoice_data/labels.txt"
)
num_labels = len(labels)
label_map = {i: label for i, label in enumerate(labels)}
print(label_map)

pad_token_label_id = CrossEntropyLoss().ignore_index

args = AttrDict(args)
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(
    args.config_name if args.config_name else args.model_name_or_path,
    num_labels=num_labels,
    cache_dir=args.cache_dir if args.cache_dir else None,
)
# tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
tokenizer = tokenizer_class.from_pretrained(
    args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    cache_dir=args.cache_dir if args.cache_dir else None,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model_class.from_pretrained(
    args.model_name_or_path,
    from_tf=bool(".ckpt" in args.model_name_or_path),
    config=config,
    cache_dir=args.cache_dir if args.cache_dir else None,
)


def model_load(PATH, num_labels):
    # Loads fine-tuned weights from PATH into the globally constructed model
    # and switches it to inference mode.
    # model = LayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=num_labels)
    model.load_state_dict(torch.load(PATH, map_location=device))
    model.to(device)
    model.eval()
    return model
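
# A minimal usage sketch for model_load. The checkpoint path is a hypothetical
# placeholder, not a file shipped with this repo: it assumes the fine-tuned
# weights were saved with torch.save(model.state_dict(), PATH), which is what
# load_state_dict above expects. See the __main__ block at the bottom for the
# full pipeline.
#
#   model = model_load(
#       "/datadrive/unilm/layoutlm/deprecated/models/layoutlm_invoices.pt",  # hypothetical
#       num_labels,
#   )
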
def preprocess(image_path):
    image = Image.open(image_path)
    image = image.convert("RGB")
    width, height = image.size
    w_scale = 1000 / width
    h_scale = 1000 / height

    ocr_df = pytesseract.image_to_data(image, output_type="data.frame")
    print(ocr_df)
    ocr_df = ocr_df.dropna().assign(
        left_scaled=ocr_df.left * w_scale,
        width_scaled=ocr_df.width * w_scale,
        top_scaled=ocr_df.top * h_scale,
        height_scaled=ocr_df.height * h_scale,
        right_scaled=lambda x: x.left_scaled + x.width_scaled,
        bottom_scaled=lambda x: x.top_scaled + x.height_scaled,
    )
    float_cols = ocr_df.select_dtypes("float").columns
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
    ocr_df = ocr_df.replace(r"^\s*$", np.nan, regex=True)
    ocr_df = ocr_df.dropna().reset_index(drop=True)

    words = list(ocr_df.text)
    coordinates = ocr_df[["left", "top", "width", "height"]]
    actual_boxes = []
    for idx, row in coordinates.iterrows():
        x, y, w, h = tuple(row)  # the row comes in (left, top, width, height) format
        actual_box = [x, y, x + w, y + h]  # we turn it into (left, top, left+width, top+height) to get the actual box
        actual_boxes.append(actual_box)

    boxes = []
    for box in actual_boxes:
        boxes.append(normalize_box(box, width, height))

    return image, words, boxes, actual_boxes


def normalize_box(box, width, height):
    return [
        int(1000 * (box[0] / width)),
        int(1000 * (box[1] / height)),
        int(1000 * (box[2] / width)),
        int(1000 * (box[3] / height)),
    ]


def convert_example_to_features(
    image,
    words,
    boxes,
    actual_boxes,
    tokenizer,
    args,
    cls_token_box=[0, 0, 0, 0],
    sep_token_box=[1000, 1000, 1000, 1000],
    pad_token_box=[0, 0, 0, 0],
):
    width, height = image.size

    tokens = []
    token_boxes = []
    actual_bboxes = []  # we use an extra b because actual_boxes is already used
    token_actual_boxes = []
    for word, box, actual_bbox in zip(words, boxes, actual_boxes):
        word_tokens = tokenizer.tokenize(word)
        tokens.extend(word_tokens)
        token_boxes.extend([box] * len(word_tokens))
        actual_bboxes.extend([actual_bbox] * len(word_tokens))
        token_actual_boxes.extend([actual_bbox] * len(word_tokens))

    # Truncation: account for [CLS] and [SEP] with "- 2".
    special_tokens_count = 2
    if len(tokens) > args.max_seq_length - special_tokens_count:
        tokens = tokens[: (args.max_seq_length - special_tokens_count)]
        token_boxes = token_boxes[: (args.max_seq_length - special_tokens_count)]
        actual_bboxes = actual_bboxes[: (args.max_seq_length - special_tokens_count)]
        token_actual_boxes = token_actual_boxes[: (args.max_seq_length - special_tokens_count)]

    # Add the [SEP] token, with corresponding token boxes and actual boxes.
    tokens += [tokenizer.sep_token]
    token_boxes += [sep_token_box]
    actual_bboxes += [[0, 0, width, height]]
    token_actual_boxes += [[0, 0, width, height]]
    segment_ids = [0] * len(tokens)

    # Next: the [CLS] token.
    tokens = [tokenizer.cls_token] + tokens
    token_boxes = [cls_token_box] + token_boxes
    actual_bboxes = [[0, 0, width, height]] + actual_bboxes
    token_actual_boxes = [[0, 0, width, height]] + token_actual_boxes
    segment_ids = [1] + segment_ids

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length. (BERT-style vocabularies map [PAD]
    # to id 0, so padding segment_ids with pad_token_id pads them with 0.)
    padding_length = args.max_seq_length - len(input_ids)
    input_ids += [tokenizer.pad_token_id] * padding_length
    input_mask += [0] * padding_length
    segment_ids += [tokenizer.pad_token_id] * padding_length
    token_boxes += [pad_token_box] * padding_length
    token_actual_boxes += [pad_token_box] * padding_length

    assert len(input_ids) == args.max_seq_length
    assert len(input_mask) == args.max_seq_length
    assert len(segment_ids) == args.max_seq_length
    assert len(token_boxes) == args.max_seq_length
    assert len(token_actual_boxes) == args.max_seq_length

    return input_ids, input_mask, segment_ids, token_boxes, token_actual_boxes
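
# Worked example for normalize_box (numbers are illustrative, not taken from a
# real invoice): LayoutLM expects boxes on a fixed 0-1000 grid, so on an
# 800x1000-pixel page a Tesseract box with left=100, top=200, right=300,
# bottom=400 maps to
#
#   normalize_box([100, 200, 300, 400], width=800, height=1000)
#   # -> [125, 200, 375, 400]   (i.e. 1000*100/800, 1000*200/1000, ...)
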
def convert_to_features(image, words, boxes, actual_boxes, model):
    (
        input_ids,
        input_mask,
        segment_ids,
        token_boxes,
        token_actual_boxes,
    ) = convert_example_to_features(
        image=image,
        words=words,
        boxes=boxes,
        actual_boxes=actual_boxes,
        tokenizer=tokenizer,
        args=args,
    )
    print(tokenizer.decode(input_ids))

    # Batchify: every tensor gets a leading batch dimension of size 1.
    input_ids = torch.tensor(input_ids, device=device).unsqueeze(0)
    print(input_ids.shape)
    attention_mask = torch.tensor(input_mask, device=device).unsqueeze(0)
    print(attention_mask.shape)
    token_type_ids = torch.tensor(segment_ids, device=device).unsqueeze(0)
    bbox = torch.tensor(token_boxes, device=device).unsqueeze(0)
    print(bbox.shape)

    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
    # The predictions are at the token level.
    logits = outputs[0]
    token_predictions = logits.argmax(-1).squeeze().tolist()
    print(token_predictions, len(token_predictions))

    # Turn the token-level predictions into word-level predictions by skipping
    # WordPiece continuations ("##...") and special tokens.
    word_level_predictions = []
    final_boxes = []
    for id, token_pred, box in zip(
        input_ids.squeeze().tolist(), token_predictions, token_actual_boxes
    ):
        if (tokenizer.decode([id]).startswith("##")) or (
            id in [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]
        ):
            # Skip this prediction and its bounding box.
            continue
        else:
            word_level_predictions.append(token_pred)
            final_boxes.append(box)

    return word_level_predictions, final_boxes
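

# End-to-end usage sketch. The image filename and checkpoint path below are
# hypothetical placeholders, and the box-drawing step is an added illustration
# (it is what the ImageDraw/ImageFont imports at the top are for), reusing the
# label_map defined above to turn predicted ids back into tags.
if __name__ == "__main__":
    model = model_load(
        "/datadrive/unilm/layoutlm/deprecated/models/layoutlm_invoices.pt",  # hypothetical checkpoint
        num_labels,
    )
    image, words, boxes, actual_boxes = preprocess("invoice.png")  # hypothetical test image
    word_level_predictions, final_boxes = convert_to_features(
        image, words, boxes, actual_boxes, model
    )

    # Overlay each word-level prediction on its (unnormalized) bounding box.
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    for prediction, box in zip(word_level_predictions, final_boxes):
        predicted_label = label_map[prediction]
        draw.rectangle(box, outline="red")
        draw.text((box[0] + 10, box[1] - 10), text=predicted_label, fill="red", font=font)
    image.save("invoice_annotated.png")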