Created a tutorial for extracting some quality measures from raw asc … #888

322 changes: 322 additions & 0 deletions docs/source/tutorials/eyetracking_measures.ipynb
@@ -0,0 +1,322 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0",
"metadata": {},
"source": [
"# Quality measures tutorial\n",
"\n",
"This is a tutorial for creating quality measures from raw data (.asc). It will include the absolute values and percentage of data for different parameters (missing pupil data, eye-tracking data outside the experiment).\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"from pathlib import Path\n",
"\n",
"import polars as pl\n",
"\n",
"import pymovements as pm"
]
},
{
"cell_type": "markdown",
"id": "2",
"metadata": {},
"source": [
"After importing some basic libraries let's load an example eyetracking file."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3",
"metadata": {},
"outputs": [],
"source": [
"asc = \"ch1hr007.asc\"\n",
"\n",
"data, metadata = pm.gaze.from_asc(\n",
" asc,\n",
" patterns=[\n",
" r\"start_recording_(?P<trial>(?:PRACTICE_)?trial_\\d+)_(?P<screen>.+)\",\n",
" {\"pattern\": r\"stop_recording_\", \"column\": \"trial\", \"value\": None},\n",
" {\"pattern\": r\"stop_recording_\", \"column\": \"screen\", \"value\": None},\n",
" {\n",
" \"pattern\": r\"start_recording_(?:PRACTICE_)?trial_\\d+_page_\\d+\",\n",
" \"column\": \"activity\",\n",
" \"value\": \"reading\",\n",
" },\n",
" {\n",
" \"pattern\": r\"start_recording_(?:PRACTICE_)?trial_\\d+_question_\\d+\",\n",
" \"column\": \"activity\",\n",
" \"value\": \"question\",\n",
" },\n",
" {\n",
" \"pattern\": r\"start_recording_(?:PRACTICE_)?trial_\\d+_(familiarity_rating_screen_\\d+|subject_difficulty_screen)\",\n",
" \"column\": \"activity\",\n",
" \"value\": \"rating\",\n",
" },\n",
" {\"pattern\": r\"stop_recording_\", \"column\": \"activity\", \"value\": None},\n",
" {\n",
" \"pattern\": r\"start_recording_PRACTICE_trial_\",\n",
" \"column\": \"practice\",\n",
" \"value\": True,\n",
" },\n",
" {\n",
" \"pattern\": r\"start_recording_trial_\",\n",
" \"column\": \"practice\",\n",
" \"value\": False,\n",
" },\n",
" {\"pattern\": r\"stop_recording_\", \"column\": \"practice\", \"value\": None},\n",
" ],\n",
")\n",
"data.frame\n",
"\n",
"print(data)"
]
},
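{
"cell_type": "markdown",
"id": "3a",
"metadata": {},
"source": [
"Before computing any measures, it helps to peek at the parsed metadata. This is a minimal sketch assuming the keys used later in this tutorial (`sampling_rate`, `data_loss_ratio`, `data_loss_ratio_blinks`, `total_recording_duration_ms`, `validations`) are present for this recording; other files may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b",
"metadata": {},
"outputs": [],
"source": [
"# A quick look at the metadata fields used later in this tutorial\n",
"for key in ('sampling_rate', 'data_loss_ratio', 'data_loss_ratio_blinks', 'total_recording_duration_ms'):\n",
"    print(key, metadata.get(key))\n",
"\n",
"# Number of validations performed during the recording\n",
"print('validations:', len(metadata.get('validations', [])))"
]
},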
{
"cell_type": "markdown",
"id": "4",
"metadata": {},
"source": [
"We will split the pixel column into two for x and y coordinates of the gaze."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5",
"metadata": {},
"outputs": [],
"source": [
"# Unnest the 'pixel' column\n",
"\n",
"data.frame = data.frame.select(\n",
" [\n",
" pl.all().exclude(\"pixel\"),\n",
" pl.col(\"pixel\").list.get(0).alias(\"pixel_x\"),\n",
" pl.col(\"pixel\").list.get(1).alias(\"pixel_y\"),\n",
" ]\n",
")\n",
"data.frame"
]
},
{
"cell_type": "markdown",
"id": "6",
"metadata": {},
"source": [
"# Extracting quality measures"
]
},
{
"cell_type": "markdown",
"id": "7",
"metadata": {},
"source": [
"The following function is meant to check if the sampling rate of the eyetracker ever deviated from the expected value.\n",
"It checks if consecutive timepoints ever differ by more than the value of the expected_diff argument. We're checking only the rows when a task definied by activity_id is performed.\n",
"An eyetracker with a constant refresh rate should return 0 skipped_time_absolute and a 0 skipped_time_ratio.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8",
"metadata": {},
"outputs": [],
"source": [
"# function calculating for skipped time\n",
"\n",
"\n",
"def time_loss(df: pl.DataFrame, task_column: str = 'activity', activity_id: str = 'page',\n",
" target_column: str = 'time', expected_diff: float = 0.5, tolerance: float = 1e-7):\n",
" # Filter the DataFrame for rows where the 'activity' column contains the word 'page'\n",
" filtered_df = df.filter(pl.col(task_column).str.contains(activity_id))\n",
" # Calculate the difference between consecutive rows\n",
" differences = filtered_df[target_column].diff().drop_nulls()\n",
" # Store difference between timestep and expected_diff, where the difference is signifficant\n",
" large_differences = differences.filter(differences > expected_diff) - expected_diff\n",
" # total skipped time\n",
" skipped_time_absolute = sum(large_differences)\n",
" # ratio of skipped time to experiment duration\n",
" total_duration = (df[target_column][len(df) - 1] - df[target_column][0])\n",
" skipped_time_ratio = (skipped_time_absolute / total_duration)\n",
" return skipped_time_absolute, skipped_time_ratio\n",
"\n",
"\n",
"time_loss(data.frame)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9",
"metadata": {},
"outputs": [],
"source": [
"def missing_pupil(df, sampling_rate, pupil_col):\n",
" miss_pupil_tuple = df[pupil_col].value_counts().row(by_predicate=(pl.col(pupil_col) == 0.0))\n",
" abs_miss_pupil = miss_pupil_tuple[1] / sampling_rate\n",
" per_miss_pupil = miss_pupil_tuple[1] / (df.height)\n",
" return per_miss_pupil, abs_miss_pupil"
]
},
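{
"cell_type": "markdown",
"id": "9a",
"metadata": {},
"source": [
"As a quick sanity check, we can call `missing_pupil` directly on the loaded data. This assumes, as the function above does, that a lost pupil is encoded as a pupil size of 0.0 in the `pupil` column."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b",
"metadata": {},
"outputs": [],
"source": [
"# Ratio and seconds of missing pupil data for the example file\n",
"pupil_ratio, pupil_seconds = missing_pupil(data.frame, metadata['sampling_rate'], 'pupil')\n",
"print(f'missing pupil: {pupil_ratio:.2%} of samples, {pupil_seconds:.2f} s')"
]
},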
{
"cell_type": "code",
"execution_count": null,
"id": "10",
"metadata": {},
"outputs": [],
"source": [
"def missing_gaze(df, sampling_rate, gaze_x_col):\n",
" abs_miss_gaze_x = data.frame.select(pl.col(gaze_x_col).is_null().sum()).item()\n",
" per_miss_gaze_x = abs_miss_gaze_x / (df.height)\n",
" abs_miss_gaze_x / sampling_rate\n",
" return per_miss_gaze_x, abs_miss_gaze_x"
]
},
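{
"cell_type": "markdown",
"id": "10a",
"metadata": {},
"source": [
"The same kind of check works for the gaze data, using the `pixel_x` column we created by unnesting the pixel column earlier."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10b",
"metadata": {},
"outputs": [],
"source": [
"# Ratio and seconds of missing gaze samples for the example file\n",
"gaze_ratio, gaze_seconds = missing_gaze(data.frame, metadata['sampling_rate'], 'pixel_x')\n",
"print(f'missing gaze: {gaze_ratio:.2%} of samples, {gaze_seconds:.2f} s')"
]
},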
{
"cell_type": "code",
"execution_count": null,
"id": "11",
"metadata": {},
"outputs": [],
"source": [
"def off_task_time(df, sampling_rate, data_col):\n",
" null_values_tab = df.null_count()\n",
" abs_miss_screen = null_values_tab[data_col][0]\n",
" per_miss_screen = abs_miss_screen / (df.height)\n",
" return abs_miss_screen / 1000, per_miss_screen"
]
},
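{
"cell_type": "markdown",
"id": "11a",
"metadata": {},
"source": [
"And likewise for the recording time spent outside any experimental screen, using the `screen` column filled by the message patterns defined when loading the file."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11b",
"metadata": {},
"outputs": [],
"source": [
"# Seconds and ratio of recording time outside the experimental tasks\n",
"off_seconds, off_ratio = off_task_time(data.frame, metadata['sampling_rate'], 'screen')\n",
"print(f'off-task: {off_seconds:.2f} s, {off_ratio:.2%} of samples')"
]
},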
{
"cell_type": "markdown",
"id": "12",
"metadata": {},
"source": [
"The following function gets the information about validation, specifically average value of all of the validations and maximal value of all of the validations. It is called in the next get_qual_check function."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "13",
"metadata": {},
"outputs": [],
"source": [
"def get_validation_data(validations):\n",
" sum_average = 0.0\n",
" max_values = []\n",
" for validation in validations:\n",
" sum_average += float(validation['validation_score_avg'])\n",
" max_values.append(float(validation['validation_score_max']))\n",
" average_average = sum_average / len(validations)\n",
" global_max = max(max_values)\n",
" return average_average, global_max"
]
},
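{
"cell_type": "markdown",
"id": "13a",
"metadata": {},
"source": [
"To illustrate the aggregation, here is a call on two made-up validation entries. The scores below are hypothetical and not taken from the example file."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "13b",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical validation entries, for illustration only\n",
"example_validations = [\n",
"    {'validation_score_avg': '0.25', 'validation_score_max': '0.48'},\n",
"    {'validation_score_avg': '0.31', 'validation_score_max': '0.95'},\n",
"]\n",
"\n",
"# Expected output: mean of the averages (0.28) and the overall maximum (0.95)\n",
"print(get_validation_data(example_validations))"
]
},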
{
"cell_type": "markdown",
"id": "14",
"metadata": {},
"source": [
"The following function extracts certain signifficant quality measures from the metadata and makes use of the functions above to calculate its own measures. The result is saved as a csv file."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15",
"metadata": {},
"outputs": [],
"source": [
"def get_qual_check(\n",
" df: pl.DataFrame, # data frame with raw values\n",
" metadata: dict, # dictionary with metadata\n",
" csv_name: str = 'out.csv', # name of the output csv file, need to end with .csv\n",
" pupil_col: str = \"pupil\", # column in df where pupil data are stored\n",
" data_col: str = 'screen', # column in df where the screen activity is stored\n",
" gaze_x_col: str = 'pixel_x', # column in df with the gaze X coordinates\n",
" trial_col: str = 'trial'): # column in df with the trial runs\n",
"\n",
" measures_dict = {}\n",
" # check metadata values\n",
" measures_dict['sampling_rate'] = metadata['sampling_rate']\n",
" measures_dict['data_loss_ratio'] = metadata['data_loss_ratio']\n",
" measures_dict['data_loss_ratio_blinks'] = metadata['data_loss_ratio_blinks']\n",
" measures_dict['total_recording_duration_sec'] = metadata['total_recording_duration_ms'] / 1000\n",
"\n",
" # Check amount of pupil omissions\n",
" measures_dict['missing_pupil_ratio'], measures_dict['missing_pupil_sec'] = missing_pupil(\n",
" df, measures_dict['sampling_rate'], pupil_col)\n",
"\n",
" # Check amount of missing gaze data\n",
" measures_dict['missing_gaze_ratio'], measures_dict['missing_gaze_sec'] = missing_gaze(\n",
" df, measures_dict['sampling_rate'], gaze_x_col)\n",
"\n",
" # Check the amount of time spent not on experimental tasks\n",
"\n",
" measures_dict['off_task_time_sec'], measures_dict['off_task_time_ratio'] = off_task_time(\n",
" df, metadata['sampling_rate'], data_col)\n",
"\n",
" # Check the average quality of validation\n",
" measures_dict['average_validation_score'], measures_dict['global_max_validation_score'] = get_validation_data(\n",
" metadata['validations'])\n",
"\n",
" # Check time loss\n",
" measures_dict['time_loss_sec'] = time_loss(df)[0]\n",
" measures_dict['time_loss_ratio'] = time_loss(df)[1]\n",
"\n",
" # Divide data frame by trials\n",
" list_of_trials_raw = data.frame.partition_by(by=trial_col)\n",
" list_of_trials = [i for i in list_of_trials_raw if i.item(1, trial_col) is not None]\n",
" i = 0\n",
"\n",
" # Check the quality measures for separate trials\n",
" for trial in list_of_trials:\n",
" null_ratio_expr = pm.measure.measures.null_ratio(\"pixel_x\", pl.Float64)\n",
" null_ratio = trial.select([null_ratio_expr]).item()\n",
" trial_name = str(trial.item(1, trial_col))\n",
" measures_dict[trial_name + '_null_ratio'] = null_ratio\n",
" measures_dict[trial_name +\n",
" '_average_validation'] = metadata['validations'][i][\"validation_score_avg\"]\n",
" measures_dict[trial_name +\n",
" '_max_validation'] = metadata['validations'][i][\"validation_score_max\"]\n",
" measures_dict[trial_name + '_error'] = metadata['validations'][i][\"error\"]\n",
" measures_dict[trial_name + '_tracked_eye'] = metadata['validations'][i][\"tracked_eye\"]\n",
" i += 1\n",
"\n",
" # Save measures in csv\n",
" with open(csv_name, \"w\", newline=\"\") as f:\n",
" w = csv.DictWriter(f, measures_dict.keys())\n",
" w.writeheader()\n",
" w.writerow(measures_dict)\n",
" print(measures_dict)\n",
"\n",
"\n",
"get_qual_check(data.frame, metadata)"
]
},
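{
"cell_type": "markdown",
"id": "15a",
"metadata": {},
"source": [
"The resulting file can be read back to verify its contents. By default it is written to `out.csv` in the current working directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15b",
"metadata": {},
"outputs": [],
"source": [
"# Read the measures back in as a one-row data frame\n",
"pl.read_csv('out.csv')"
]
},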
{
"cell_type": "code",
"execution_count": null,
"id": "16",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}