Merge branch 'master' of github.com:mwydmuch/napkinXC

mwydmuch · Feb 2, 2021 · dc31543 · dc31543
2 parents a239c2c + 9a52b49
commit dc31543
Show file tree

Hide file tree

Showing 34 changed files with 708 additions and 553 deletions.
diff --git a/.gitignore b/.gitignore
@@ -20,10 +20,11 @@ nxc
 # Docs
 docs/*/*
 
-# Experiments
-/data
+# Experiments and examples
+data
 /models*
 /results*
+eurlex-model
 
 # Misc
 .idea

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,7 +3,7 @@
 
 cmake_minimum_required(VERSION 3.12)
 project(napkinXC
-    VERSION 0.4.3
+    VERSION 0.5.0
     DESCRIPTION "Extremely simple and fast extreme multi-class and multi-label classifiers"
     HOMEPAGE_URL https://github.com/mwydmuch/napkinXC
     LANGUAGES C CXX)
@@ -42,8 +42,6 @@ configure_file(
     ${SRC_DIR}/version.h
 )
 
-
-
 # Gather napkinXC source files
 file(GLOB SOURCES
     ${SRC_DIR}/*.cpp

diff --git a/experiments/calculate_inv_ps.py b/experiments/calculate_inv_ps.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import sys
+import os
+
+napkinxc_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../python")
+sys.path.append(napkinxc_path)
+
+from napkinxc.measures import *
+
+
+def load_true_file(filepath):
+    with open(filepath) as file:
+        Y = []
+        for i, line in enumerate(file):
+            if i == 0 and len(line.split(' ')) == 3:
+                continue
+            Y.append([int(y) for y in line.strip().split(' ', 1)[0].split(',') if ':' not in y])
+        return Y
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print("Requires true file and output as arguments!")
+        exit(1)
+
+    true_file = sys.argv[1]
+    true = load_true_file(sys.argv[1])
+
+    A = 0.55
+    B = 1.5
+
+    if '/wikiLSHTC/' in true_file:
+        A = 0.5
+        B = 0.4
+    elif '/amazon/' in true_file:
+        A = 0.6
+        B = 2.6
+
+    inv_ps = inverse_propensity(true, A=A, B=B)
+    with open(sys.argv[2], "w") as out:
+        for ip in inv_ps:
+            out.write("{}\n".format(ip))
diff --git a/experiments/evaluate.py b/experiments/evaluate.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+
+import sys
+import os
+
+napkinxc_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../python")
+sys.path.append(napkinxc_path)
+
+from napkinxc.measures import *
+
+
+def load_true_file(filepath):
+    with open(filepath) as file:
+        Y = []
+        for i, line in enumerate(file):
+            if i == 0 and len(line.split(' ')) == 3:
+                continue
+            Y.append([int(y) for y in line.strip().split(' ', 1)[0].split(',') if ':' not in y])
+        return Y
+
+
+def load_pred_file(filepath):
+    with open(filepath) as file:
+        Y = []
+
+        def convert_y(y):
+            y = y.split(':')
+            if len(y) == 2:
+                return (int(y[0]), float(y[1]))
+            else:
+                return int(y)
+
+        for line in file:
+            Y.append([convert_y(y) for y in line.strip().split(' ')])
+        return Y
+
+
+def load_inv_ps_file(filepath):
+    with open(filepath) as file:
+        v = []
+        for line in file:
+            v.append(float(line.strip()))
+        return v
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print("Requires true and prediction files as arguments!")
+        exit(1)
+
+    true = load_true_file(sys.argv[1])
+    pred = load_pred_file(sys.argv[2])
+
+    inv_ps = None
+    if len(sys.argv) > 3:
+        inv_ps = load_inv_ps_file(sys.argv[3])
+
+    max_k = 10
+
+    r = precision_at_k(true, pred, k=max_k)
+    for k in range(max_k):
+        print("P@{}: {}".format(k + 1, r[k]))
+
+    r = recall_at_k(true, pred, k=max_k)
+    for k in range(max_k):
+        print("R@{}: {}".format(k + 1, r[k]))
+
+    r = coverage_at_k(true, pred, k=max_k)
+    for k in range(max_k):
+        print("C@{}: {}".format(k + 1, r[k]))
+
+    r = ndcg_at_k(true, pred, k=max_k)
+    for k in range(max_k):
+        print("nDCG@{}: {}".format(k + 1, r[k]))
+
+    if inv_ps is not None:
+        r = psprecision_at_k(true, pred, inv_ps=inv_ps, k=max_k)
+        for k in range(max_k):
+            print("PSP@{}: {}".format(k + 1, r[k]))
diff --git a/experiments/remap_libsvm.py b/experiments/remap_libsvm.py
@@ -10,10 +10,10 @@ def load_libsvm(file):
     Y = []
     with open(file) as f:
         for row in f:
-            yi, xi = row.split(' ', 1)
-            X.append(xi)
-            if len(yi):
-                Y.append(yi.split(','))
+            y, x = row.split(' ', 1)
+            X.append(x)
+            if len(y):
+                Y.append(y.split(','))
             else:
                 Y.append([])
 
@@ -23,10 +23,10 @@ def load_libsvm(file):
 
 def save_libsvm(X, Y, file):
     with open(file, "w") as f:
-        for xi, yi in zip(X, Y):
-            f.write(','.join([str(y) for y in sorted(yi)]))
+        for x, y in zip(X, Y):
+            f.write(','.join([str(y_i) for y_i in sorted(y)]))
             f.write(' ')
-            f.write(xi)
+            f.write(x)
 
 
 def remap_files(files, mapping):

diff --git a/experiments/test.sh b/experiments/test.sh
@@ -51,9 +51,9 @@ elif [[ -e "${DATASET_FILE}_train.txt" ]]; then
 elif [[ -e "${DATASET_FILE}.train" ]]; then
     TRAIN_FILE="${DATASET_FILE}.train"
     TEST_FILE="${DATASET_FILE}.test"
-elif [[ -e "${DATASET_FILE}_train.svm" ]]; then
-    TRAIN_FILE="${DATASET_FILE}_train.svm"
-    TEST_FILE="${DATASET_FILE}_test.svm"
+elif [[ -e "${DATASET_FILE}_train.libsvm" ]]; then
+    TRAIN_FILE="${DATASET_FILE}_train.libsvm"
+    TEST_FILE="${DATASET_FILE}_test.libsvm"
 fi
 
 # Build nxc
@@ -65,12 +65,23 @@ if [[ ! -e ${ROOT_DIR}/nxc ]]; then
     cd ${ROOT_DIR}/experiments
 fi
 
+# Calculate inverse propensity
+INV_PS_FILE="${DATASET_FILE}.inv_ps"
+if [[ ! -e $INV_PS_FILE ]]; then
+    python3 ${SCRIPT_DIR}/calculate_inv_ps.py $TRAIN_FILE $INV_PS_FILE
+fi
+
 # Train model
 TRAIN_RESULT_FILE=${MODEL}/train_results
 TRAIN_LOCK_FILE=${MODEL}/.train_lock
 if [[ ! -e $MODEL ]] || [[ -e $TRAIN_LOCK_FILE ]]; then
     mkdir -p $MODEL
     touch $TRAIN_LOCK_FILE
+
+    if [[ $TRAIN_ARGS == *"--labelsWeights"* ]]; then
+        TRAIN_ARGS="${TRAIN_ARGS} --labelsWeights ${INV_PS_FILE}"
+    fi
+
     (time ${ROOT_DIR}/nxc train -i $TRAIN_FILE -o $MODEL $TRAIN_ARGS | tee $TRAIN_RESULT_FILE)
     echo
     echo "Train date: $(date)" | tee -a $TRAIN_RESULT_FILE
@@ -86,7 +97,36 @@ if [[ ! -e $TEST_RESULT_FILE ]] || [[ -e $TEST_LOCK_FILE ]]; then
     if [ -e $TRAIN_RESULT_FILE ]; then
         cat $TRAIN_RESULT_FILE > $TEST_RESULT_FILE
     fi
-    (time ${ROOT_DIR}/nxc test -i $TEST_FILE -o $MODEL $TEST_ARGS | tee -a $TEST_RESULT_FILE)
+    #(time ${ROOT_DIR}/nxc test -i $TEST_FILE -o $MODEL $TEST_ARGS | tee -a $TEST_RESULT_FILE)
+
+    if [[ $TEST_ARGS == *"--labelsWeights"* ]]; then
+        TEST_ARGS="${TEST_ARGS} --labelsWeights ${INV_PS_FILE}"
+    fi
+
+    PRED_FILE=${MODEL}/test_pred_$(echo "${TEST_ARGS}" | tr " /" "__")
+    PRED_LOCK_FILE=${MODEL}/.test_pred_lock_$(echo "${TEST_ARGS}" | tr " /" "__")
+    if [[ ! -e $PRED_FILE ]] || [[ -e $PRED_LOCK_FILE ]]; then
+        touch $PRED_LOCK_FILE
+        ${ROOT_DIR}/nxc predict -i $TEST_FILE -o $MODEL $TEST_ARGS > $PRED_FILE
+        rm -rf $PRED_LOCK_FILE
+    fi
+
+    echo "Test file results:" | tee -a $TEST_RESULT_FILE
+    python3 ${SCRIPT_DIR}/evaluate.py $TEST_FILE $PRED_FILE $INV_PS_FILE | tee -a $TEST_RESULT_FILE
+
+    # Optional evaluation on the training set; stays disabled while TEST_ON_TRAIN is 0.
+    TEST_ON_TRAIN=0
+    if [[ "${TEST_ON_TRAIN}" != "0" ]]; then
+        PRED_FILE=${MODEL}/train_pred_$(echo "${TEST_ARGS}" | tr " /" "__")
+        PRED_LOCK_FILE=${MODEL}/.train_pred_lock_$(echo "${TEST_ARGS}" | tr " /" "__")
+        # Re-run prediction if it is missing or a previous run left its lock behind.
+        if [[ ! -e $PRED_FILE ]] || [[ -e $PRED_LOCK_FILE ]]; then
+            touch $PRED_LOCK_FILE
+            ${ROOT_DIR}/nxc predict -i $TRAIN_FILE -o $MODEL $TEST_ARGS > $PRED_FILE
+            rm -rf $PRED_LOCK_FILE
+        fi
+
+        echo "Train results file:" | tee -a $TEST_RESULT_FILE
+        # Predictions were made on TRAIN_FILE, so score them against the train labels.
+        python3 ${SCRIPT_DIR}/evaluate.py $TRAIN_FILE $PRED_FILE $INV_PS_FILE | tee -a $TEST_RESULT_FILE
+    fi
 
     echo
     echo "Model file size: $(du -ch ${MODEL} | tail -n 1 | grep -E '[0-9\.,]+[BMG]' -o)" | tee -a $TEST_RESULT_FILE

diff --git a/experiments/utils.py b/experiments/utils.py
diff --git a/python/napkinxc/_napkinxc/_napkinxc.cpp b/python/napkinxc/_napkinxc/_napkinxc.cpp
@@ -189,7 +189,17 @@ class CPPModel {
     }
 
     void unload(){
-        if(model->isLoaded()) model->unload();
+        if(model != nullptr && model->isLoaded()) model->unload();
+    }
+
+    // Calls load() first, then forwards the thresholds vector to the underlying model.
+    // NOTE(review): presumably one threshold per label, indexed by label id — confirm.
+    void setThresholds(std::vector<double> thresholds){
+        load();
+        model->setThresholds(thresholds);
+    }
+
+    // Calls load() first, then forwards the weights vector to the underlying model.
+    // NOTE(review): presumably one weight per label, indexed by label id — confirm.
+    void setLabelsWeights(std::vector<double> weights){
+        load();
+        model->setLabelsWeights(weights);
     }
 
     std::vector<std::vector<int>> predict(py::object inputFeatures, int featuresDataType, int topK, double threshold){
@@ -209,28 +219,6 @@ class CPPModel {
         return pred;
     }
 
-    std::vector<std::vector<int>> predictWithThresholds(py::object inputFeatures, int featuresDataType, int topK, std::vector<double> thresholds){
-        auto predWithProba = predictProbaWithThresholds(inputFeatures, featuresDataType, topK, thresholds);
-        return dropProbaHelper(predWithProba);
-    }
-
-    std::vector<std::vector<std::pair<int, double>>> predictProbaWithThresholds(py::object inputFeatures, int featuresDataType, int topK, std::vector<double> thresholds){
-        std::vector<std::vector<Prediction>> pred;
-        runAsInterruptable([&] {
-            load();
-            SRMatrix<Feature> features;
-            readFeatureMatrix(features, inputFeatures, (InputDataType)featuresDataType);
-            args.printArgs("predict");
-
-            args.topK = topK;
-            model->setThresholds(thresholds);
-            pred = model->predictBatchWithThresholds(features, args);
-        });
-
-        // This is only safe because it's struct with two fields casted to pair, don't do this with tuples!
-        return reinterpret_cast<std::vector<std::vector<std::pair<int, double>>>&>(pred);
-    }
-
     std::vector<double> ofo(py::object inputFeatures, py::object inputLabels, int featuresDataType, int labelsDataType) {
         std::vector<double> thresholds;
         runAsInterruptable([&] {
@@ -501,12 +489,12 @@ PYBIND11_MODULE(_napkinxc, n) {
     .def("fit_on_file", &CPPModel::fitOnFile)
     .def("load", &CPPModel::load)
     .def("unload", &CPPModel::unload)
+    .def("set_thresholds", &CPPModel::setThresholds)
+    .def("set_labels_weights", &CPPModel::setLabelsWeights)
     .def("predict", &CPPModel::predict)
     .def("predict_proba", &CPPModel::predictProba)
     .def("predict_for_file", &CPPModel::predictForFile)
     .def("predict_proba_for_file", &CPPModel::predictProbaForFile)
-    .def("predict_with_thresholds", &CPPModel::predictWithThresholds)
-    .def("predict_proba_with_thresholds", &CPPModel::predictProbaWithThresholds)
     .def("ofo", &CPPModel::ofo)
     .def("test", &CPPModel::test)
     .def("test_on_file", &CPPModel::testOnFile)

diff --git a/python/napkinxc/measures.py b/python/napkinxc/measures.py
@@ -228,10 +228,11 @@ def inverse_propensity(Y, A=0.55, B=1.5):
     :return: ndarray with propensity scores for each label
     """
     if isinstance(Y, np.ndarray) or isinstance(Y, csr_matrix):
-        m = Y.shape[0]
+        n, m = Y.shape
         freqs = np.sum(Y, axis=0)
 
     elif all((isinstance(y, list) or isinstance(y, tuple)) for y in Y):
+        n = len(Y)
         m = max([max(y) for y in Y if len(y)])
         freqs = np.zeros(m + 1)
         for y in Y:
@@ -240,8 +241,8 @@ def inverse_propensity(Y, A=0.55, B=1.5):
     else:
         raise TypeError("Unsupported data type, should be Numpy matrix, Scipy sparse matrix or list of list of ints")
 
-    C = (log(m) - 1.0) * (B + 1) ** A
-    inv_ps = 1.0 + C * (freqs + B) ** -A
+    C = (log(n) - 1) * (B + 1) ** A
+    inv_ps = 1 + C * (freqs + B) ** -A
     return inv_ps
 
 
@@ -255,8 +256,8 @@ def psprecision_at_k(Y_true, Y_pred, inv_ps, k=5):
         Predicted labels provided as a matrix with scores or list of rankings as a list of labels or tuples of labels with scores (idx, score)..
         In the case of the matrix, the ranking will be calculated by sorting scores in descending order.
     :type Y_pred: ndarray, csr_matrix, list[list[int|str]], list[list[tuple[int|str, float]]
-    :param inv_ps: Propensity scores for each label.
-    :type inv_ps: ndarray, list
+    :param inv_ps: Propensity scores for each label. In case of text labels needs to be a dict.
+    :type inv_ps: ndarray, list, dict
     :param k: Calculate at places from 1 to k, defaults to 5
     :type k: int, optional
     :return: ndarray with values of PSP at 1-k places.
@@ -265,13 +266,27 @@ def psprecision_at_k(Y_true, Y_pred, inv_ps, k=5):
     Y_true = _get_Y_iterator(Y_true)
     Y_pred = _get_Y_iterator(Y_pred, ranking=True)
 
-    if not isinstance(inv_ps, np.ndarray):
+    def _get_top_ps_dict(t):
+        return np.array(sorted([inv_ps.get(t_i, 0) for t_i in t], reverse=True))
+
+    def _get_top_ps_np(t):
+        t = [t_i for t_i in t if t_i < inv_ps.shape[0]]
+        return -np.sort(-inv_ps[t])
+
+    if isinstance(inv_ps, dict):
+        _get_top_ps = _get_top_ps_dict
+    elif isinstance(inv_ps, list):
         inv_ps = np.array(inv_ps)
+        _get_top_ps = _get_top_ps_np
+    elif isinstance(inv_ps, np.ndarray):
+        _get_top_ps = _get_top_ps_np
+    else:
+        raise TypeError("Unsupported data type for inv_ps, should be Numpy vector (1d array), or list, or dict")
 
     sum = np.zeros(k)
     best_sum = np.zeros(k)
     for t, p in zip(Y_true, Y_pred):
-        top_ps = -np.sort(-inv_ps[t])
+        top_ps = _get_top_ps(t)
         psp_at_i = 0
         best_psp_at_i = 0
         for i, p_i in enumerate(p[:k]):
@@ -402,4 +417,4 @@ def _get_Y_iterator(Y, ranking=False):
         return _Y_list_iterator(Y)
 
     else:
-        raise TypeError("Unsupported data type, should be Numpy matrix, Scipy sparse matrix or list of list of ints")
+        raise TypeError("Unsupported data type, should be Numpy matrix (2d array), or Scipy CSR matrix, or list of list of ints")