eval_utils.py

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility function for nq evaluation."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import glob
from gzip import GzipFile
import json
import multiprocessing
from absl import flags
from absl import logging

flags.DEFINE_integer(
    "long_non_null_threshold",
    2,
    "Require this many non-null long answer annotations "
    "to count gold as containing a long answer.",
)
flags.DEFINE_integer(
    "short_non_null_threshold",
    2,
    "Require this many non-null short answer annotations "
    "to count gold as containing a short answer.",
)

FLAGS = flags.FLAGS

# A data structure for storing prediction and annotation.
# When a example has multiple annotations, multiple NQLabel will be used.
NQLabel = collections.namedtuple(
    "NQLabel",
    [
        "example_id",  # the unique id for each NQ example.
        "long_answer_span",  # A Span object for long answer.
        "short_answer_span_list",  # A list of Spans for short answer.
        #   Note that In NQ, the short answers
        #   do not need to be in a single span.
        "yes_no_answer",  # Indicate if the short answer is an yes/no answer
        #   The possible values are "yes", "no", "none".
        #   (case insensitive)
        #   If the field is "yes", short_answer_span_list
        #   should be empty or only contain null spans.
        "long_score",  # The prediction score for the long answer prediction.
        "short_score",  # The prediction score for the short answer prediction.
    ],
)


class Span(object):
    """A class for handling token and byte spans.

    The logic is:

    1) if both start_byte !=  -1 and end_byte != -1 then the span is defined
       by byte offsets
    2) else, if start_token != -1 and end_token != -1 then the span is define
       by token offsets
    3) else, this is a null span.

    Null spans means that there is no (long or short) answers.
    If your systems only care about token spans rather than byte spans, set all
    byte spans to -1.

  """

    def __init__(self, start_byte, end_byte, start_token_idx, end_token_idx):

        if (start_byte < 0 and end_byte >= 0) or (start_byte >= 0 and end_byte < 0):
            raise ValueError("Inconsistent Null Spans (Byte).")

        if (start_token_idx < 0 and end_token_idx >= 0) or (
            start_token_idx >= 0 and end_token_idx < 0
        ):
            raise ValueError("Inconsistent Null Spans (Token).")

        if start_byte >= 0 and end_byte >= 0 and start_byte >= end_byte:
            raise ValueError("Invalid byte spans (start_byte >= end_byte).")

        if (start_token_idx >= 0 and end_token_idx >= 0) and (
            start_token_idx >= end_token_idx
        ):
            raise ValueError("Invalid token spans (start_token_idx >= end_token_idx)")

        self.start_byte = start_byte
        self.end_byte = end_byte
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx

    def is_null_span(self):
        """A span is a null span if the start and end are both -1."""

        if (
            self.start_byte < 0
            and self.end_byte < 0
            and self.start_token_idx < 0
            and self.end_token_idx < 0
        ):
            return True
        return False

    def __str__(self):
        byte_str = "byte: [" + str(self.start_byte) + "," + str(self.end_byte) + ")"
        tok_str = (
            "tok: [" + str(self.start_token_idx) + "," + str(self.end_token_idx) + ")"
        )

        return byte_str + " " + tok_str

    def __repr__(self):
        return self.__str__()


def is_null_span_list(span_list):
    """Returns true iff all spans in span_list are null or span_list is empty."""
    if not span_list or all([span.is_null_span() for span in span_list]):
        return True
    return False


def nonnull_span_equal(span_a, span_b):
    """Given two spans, return if they are equal.

  Args:
    span_a: a Span object.
    span_b: a Span object.  Only compare non-null spans. First, if the bytes are
      not negative, compare byte offsets, Otherwise, compare token offsets.

  Returns:
    True or False
  """
    assert isinstance(span_a, Span)
    assert isinstance(span_b, Span)
    assert not span_a.is_null_span()
    assert not span_b.is_null_span()

    # if byte offsets are not negative, compare byte offsets
    if (span_a.start_byte >= 0 and span_a.end_byte >= 0) and (
        span_b.start_byte >= 0 and span_b.end_byte >= 0
    ):

        if (span_a.start_byte == span_b.start_byte) and (
            span_a.end_byte == span_b.end_byte
        ):
            return True

    # if token offsets are not negative, compare token offsets
    if (span_a.start_token_idx >= 0 and span_a.end_token_idx >= 0) and (
        span_b.start_token_idx >= 0 and span_b.end_token_idx >= 0
    ):

        if (span_a.start_token_idx == span_b.start_token_idx) and (
            span_a.end_token_idx == span_b.end_token_idx
        ):
            return True

    return False


def span_set_equal(gold_span_list, pred_span_list):
    """Make the spans are completely equal besides null spans."""

    gold_span_list = [span for span in gold_span_list if not span.is_null_span()]
    pred_span_list = [span for span in pred_span_list if not span.is_null_span()]

    for pspan in pred_span_list:
        # not finding pspan equal to any spans in gold_span_list
        if not any([nonnull_span_equal(pspan, gspan) for gspan in gold_span_list]):
            return False

    for gspan in gold_span_list:
        # not finding gspan equal to any spans in pred_span_list
        if not any([nonnull_span_equal(pspan, gspan) for pspan in pred_span_list]):
            return False

    return True


def gold_has_short_answer(gold_label_list):
    """Gets vote from multi-annotators for judging if there is a short answer."""

    #  We consider if there is a short answer if there is an short answer span or
    #  the yes/no answer is not none.
    gold_has_answer = (
        gold_label_list
        and sum(
            [
                (
                    (not is_null_span_list(label.short_answer_span_list))
                    or (label.yes_no_answer != "none")
                )
                for label in gold_label_list
            ]
        )
        >= FLAGS.short_non_null_threshold
    )

    return gold_has_answer


def gold_has_long_answer(gold_label_list):
    """Gets vote from multi-annotators for judging if there is a long answer."""

    gold_has_answer = gold_label_list and (
        sum(
            [
                not label.long_answer_span.is_null_span()  # long answer not null
                for label in gold_label_list  # for each annotator
            ]
        )
        >= FLAGS.long_non_null_threshold
    )

    return gold_has_answer


def read_prediction_json(predictions_path):
    """Read the prediction json with scores.

  Args:
    predictions_path: the path for the prediction json.

  Returns:
    A dictionary with key = example_id, value = NQInstancePrediction.

  """
    logging.info("Reading predictions from file: %s", format(predictions_path))
    with open(predictions_path, "r") as f:
        predictions = json.loads(f.read())

    nq_pred_dict = {}
    for single_prediction in predictions["predictions"]:

        if "long_answer" in single_prediction:
            long_span = Span(
                single_prediction["long_answer"]["start_byte"],
                single_prediction["long_answer"]["end_byte"],
                single_prediction["long_answer"]["start_token"],
                single_prediction["long_answer"]["end_token"],
            )
        else:
            long_span = Span(-1, -1, -1, -1)  # Span is null if not presented.

        short_span_list = []
        if "short_answers" in single_prediction:
            for short_item in single_prediction["short_answers"]:
                short_span_list.append(
                    Span(
                        short_item["start_byte"],
                        short_item["end_byte"],
                        short_item["start_token"],
                        short_item["end_token"],
                    )
                )

        yes_no_answer = "none"
        if "yes_no_answer" in single_prediction:
            yes_no_answer = single_prediction["yes_no_answer"].lower()
            if yes_no_answer not in ["yes", "no", "none"]:
                raise ValueError("Invalid yes_no_answer value in prediction")

            if yes_no_answer != "none" and not is_null_span_list(short_span_list):
                raise ValueError("yes/no prediction and short answers cannot coexist.")

        pred_item = NQLabel(
            example_id=single_prediction["example_id"],
            long_answer_span=long_span,
            short_answer_span_list=short_span_list,
            yes_no_answer=yes_no_answer,
            long_score=single_prediction["long_answer_score"],
            short_score=single_prediction["short_answers_score"],
        )

        nq_pred_dict[single_prediction["example_id"]] = pred_item

    return nq_pred_dict


def read_annotation_from_one_split(gzipped_input_file):
    """Read annotation from one split of file."""
    if isinstance(gzipped_input_file, str):
        gzipped_input_file = open(gzipped_input_file, "rb")
    logging.info("parsing %s ..... ", gzipped_input_file.name)
    annotation_dict = {}
    with GzipFile(fileobj=gzipped_input_file) as input_file:
        for line in input_file:
            json_example = json.loads(line)
            example_id = json_example["example_id"]

            # There are multiple annotations for one nq example.
            annotation_list = []

            for annotation in json_example["annotations"]:
                long_span_rec = annotation["long_answer"]
                long_span = Span(
                    long_span_rec["start_byte"],
                    long_span_rec["end_byte"],
                    long_span_rec["start_token"],
                    long_span_rec["end_token"],
                )

                short_span_list = []
                for short_span_rec in annotation["short_answers"]:
                    short_span = Span(
                        short_span_rec["start_byte"],
                        short_span_rec["end_byte"],
                        short_span_rec["start_token"],
                        short_span_rec["end_token"],
                    )
                    short_span_list.append(short_span)

                gold_label = NQLabel(
                    example_id=example_id,
                    long_answer_span=long_span,
                    short_answer_span_list=short_span_list,
                    long_score=0,
                    short_score=0,
                    yes_no_answer=annotation["yes_no_answer"].lower(),
                )

                annotation_list.append(gold_label)
            annotation_dict[example_id] = annotation_list

    return annotation_dict


def read_annotation(path_name, n_threads=10):
    """Read annotations with real multiple processes."""
    input_paths = glob.glob(path_name)
    pool = multiprocessing.Pool(n_threads)
    try:
        dict_list = pool.map(read_annotation_from_one_split, input_paths)
    finally:
        pool.close()
        pool.join()

    final_dict = {}
    for single_dict in dict_list:
        final_dict.update(single_dict)

    return final_dict