
Make explore_DL2 notebook faster and less memory-hungry #1231

Merged
merged 3 commits on Feb 14, 2024
57 changes: 44 additions & 13 deletions notebooks/explore_DL2.ipynb
@@ -15,6 +15,7 @@
"from astropy.coordinates.erfa_astrom import ErfaAstromInterpolator, erfa_astrom\n",
"from lstchain.reco.utils import get_effective_time, extract_source_position, compute_theta2\n",
"from ctapipe.containers import EventType\n",
"import gc\n",
"\n",
"%matplotlib inline"
]
@@ -29,7 +30,9 @@
"This notebook opens a set of LST-1 DL2 files, divides the data into two subsets according to reconstructed energy, and computes (for given gammaness cuts) the theta2 plots with respect to a direction specified by the user (the candidate source).\n",
"\n",
"The cuts (gammaness & theta2) used for computing significances are \"reasonable\" for a first attempt at source detection. The ones for the high-E subset are about optimal for sensitivity (at low zeniths, say below 40 deg, good observation conditions and a Crab-like spectrum). \n",
"For low-energies it is hard to say what \"optimal cuts\" would be - that is quite dependent on the source energy spectrum, and also more sensitive to zenith angle (via the energy threshold). Do **not** play with the cuts on a yet-undetected source! If custom optimization of cuts is necessary, that can be done on simulations (for an assumed spectrum) or on a confirmed and bright source."
"For low energies it is hard to say what \"optimal cuts\" would be - that depends strongly on the source energy spectrum, and is also more sensitive to zenith angle (via the energy threshold). Do **not** play with the cuts on a yet-undetected source! If custom optimization of cuts is necessary, it can be done on simulations (for an assumed spectrum) or on a confirmed and bright source.\n",
"\n",
"NOTE: the notebook is quite slow (mainly due to data loading and coordinate transformations) and memory-hungry, so datasets longer than several tens of hours *may* be problematic. If you run into trouble, you can instead produce the theta2 plots from DL3 FITS files (using Gammapy)."
]
},
{
@@ -47,10 +50,12 @@
"metadata": {},
"outputs": [],
"source": [
"dataset = glob.glob(\"/fefs/aswg/workspace/abelardo.moralejo/Crab_winter_22to23/TMP/DL2/dl2_*.h5\")\n",
"dataset = glob.glob(\"/fefs/aswg/workspace/abelardo.moralejo/Crab_test_school_2024/DL2/dl2*.h5\")\n",
"source_name = \"Crab\" \n",
"# theta2 plots will be calculated w.r.t. this source (name must be known to astropy, \n",
"# and must be in the FoV for the selected dataset)"
"# and must be in the FoV for the selected dataset)\n",
"\n",
"lowest_gammaness = 0.3 # events with lower gammaness will be skipped. Just to save memory!"
]
},
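The new `lowest_gammaness` pre-cut trades a little generality for a large memory saving. A self-contained toy sketch of its effect (not from the PR; the values are made up, and on real DL2 data the fraction removed is much larger, since hadron-like events pile up at low gammaness):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
# Toy stand-in for one DL2 table (hypothetical values, not real LST-1 data)
tb = pd.DataFrame({
    "gammaness": rng.uniform(0.0, 1.0, 10_000),
    "reco_energy": rng.exponential(0.1, 10_000),
})

lowest_gammaness = 0.3  # same pre-cut as in the notebook
kept = tb[tb["gammaness"] > lowest_gammaness]
print(f"kept {len(kept)} of {len(tb)} events "
      f"({100 * len(kept) / len(tb):.0f}%)")
```

Since all later analysis cuts use gammaness thresholds of 0.5 or higher, events below 0.3 can never pass them, so discarding them at load time changes no result.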
{
@@ -76,46 +81,64 @@
"t_eff = 0\n",
"t_elapsed = 0\n",
"\n",
"# To save memory we keep only the necessary columns of the DL2 table.\n",
"# If you need to access other parameters, just add their names below.\n",
"needed_columns = ['dragon_time', 'alt_tel', 'az_tel',\n",
" 'reco_src_x', 'reco_src_y', 'gammaness',\n",
" 'intensity', 'reco_energy', 'event_type']\n",
"\n",
"\n",
"for file in dataset:\n",
" print(file)\n",
" tb = pd.read_hdf(file, tablename)\n",
" lt, et = get_effective_time(tb)\n",
" t_eff += lt\n",
" t_elapsed += et\n",
" dummy.append(tb)\n",
"\n",
" # Reduce precision to save memory:\n",
" for colname in needed_columns:\n",
" if colname == 'dragon_time': # here we need float64, keep it.\n",
" continue\n",
" if tb[colname].dtype != 'float64':\n",
" continue\n",
" tb[colname] = tb[colname].astype('float32')\n",
" \n",
" dummy.append(tb[needed_columns][tb['gammaness']>lowest_gammaness])\n",
" tb = None\n",
" gc.collect() # free memory\n",
" \n",
"table = pd.concat(dummy)"
]
},
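The float64-to-float32 downcast in the loop above roughly halves the footprint of every column except the timestamps. An isolated sketch of the same pattern on a toy table (hypothetical values, not DL2 data):

```python
import numpy as np
import pandas as pd

# Toy stand-in for a concatenated DL2 table
n = 100_000
tb = pd.DataFrame({
    "dragon_time": np.linspace(1.6e9, 1.6e9 + 1, n),  # timestamps: keep float64
    "gammaness": np.full(n, 0.5),
    "reco_energy": np.full(n, 0.1),
})

before = tb.memory_usage(deep=True).sum()
for colname in tb.columns:
    if colname == "dragon_time":        # needs the full float64 precision
        continue
    if tb[colname].dtype == "float64":
        tb[colname] = tb[colname].astype("float32")
after = tb.memory_usage(deep=True).sum()

print(f"before: {before / 1024:.0f} KB, after: {after / 1024:.0f} KB")
```

`dragon_time` is exempt because Unix-epoch timestamps need more significant digits than float32's ~7, while float32 precision is plenty for reconstructed quantities like gammaness or energy.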
{
"cell_type": "code",
"execution_count": null,
"id": "78608dc7",
"id": "24313927",
"metadata": {},
"outputs": [],
"source": [
"import gc"
"dummy = None\n",
"gc.collect() # free memory (in case of long table)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24313927",
"id": "orange-isolation",
"metadata": {},
"outputs": [],
"source": [
"dummy = None\n",
"gc.collect() # clean-up memory (in case of long table)"
"print('Number of events:', len(table))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "orange-isolation",
"id": "b45fead6-f79d-4cde-a66a-759351f57385",
"metadata": {},
"outputs": [],
"source": [
"len(table)"
"print(f'Size of table (KB): {table.__sizeof__()/1024:.0f}')"
]
},
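The new cell reports the table size via `table.__sizeof__()`. As a side note (not part of the PR), `DataFrame.memory_usage(deep=True)` gives an equivalent total plus a per-column breakdown, which is handy when deciding which columns to drop or downcast; in recent pandas versions `__sizeof__` appears to delegate to it. A toy sketch:

```python
import pandas as pd

# Toy frame with one object (string) column; hypothetical values
df = pd.DataFrame({
    "reco_energy": [0.1, 0.5, 1.2],
    "run_id": ["Run02969", "Run02970", "Run02971"],
})

# deep=True also counts the Python strings inside object columns
print(df.memory_usage(deep=True))
print(f"total: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")
```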
{
@@ -148,7 +171,7 @@
"# Set the cuts here. \n",
"# We make two subsets: Ereco < 0.2 TeV and Ereco > 0.2 TeV\n",
"\n",
"min_gammaness_cut = [0.5, 0.95]\n",
"min_gammaness_cut = [0.5, 0.95] # note: the table already has the prior cut gammaness > lowest_gammaness (defined above)\n",
"min_intensity_cut = [50, 50] # p.e.\n",
"min_energy_cut = [0., 0.2] # TeV\n",
"max_energy_cut = [0.2, 1e6] # TeV\n",
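Downstream, the paired cut lists above amount to one boolean mask per energy subset. A self-contained toy sketch of that selection (the event values are made up, not from the PR):

```python
import pandas as pd

# Toy table with the columns the cuts act on (hypothetical events)
table = pd.DataFrame({
    "gammaness":   [0.60, 0.97, 0.40, 0.99],
    "intensity":   [80.0, 120.0, 40.0, 300.0],  # p.e.
    "reco_energy": [0.10, 0.50, 0.15, 0.05],    # TeV
})

min_gammaness_cut = [0.5, 0.95]
min_intensity_cut = [50, 50]    # p.e.
min_energy_cut = [0., 0.2]      # TeV
max_energy_cut = [0.2, 1e6]     # TeV

subsets = []
for g, i, emin, emax in zip(min_gammaness_cut, min_intensity_cut,
                            min_energy_cut, max_energy_cut):
    mask = ((table["gammaness"] > g) & (table["intensity"] > i)
            & (table["reco_energy"] > emin) & (table["reco_energy"] < emax))
    subsets.append(table[mask])

print([len(s) for s in subsets])  # → [2, 1] (low-E and high-E subsets)
```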
@@ -308,7 +331,15 @@
{
"cell_type": "code",
"execution_count": null,
"id": "a0d9a0fb",
"id": "642d787c-ef5c-4c89-b878-f8c4678da896",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d86b88b5-6858-4092-b9d7-768cf1eca9c1",
"metadata": {},
"outputs": [],
"source": []