add mantel test to iqtree stats

ktmeaton · Jun 2, 2021 · faafa25 · faafa25
1 parent fc321f4
commit faafa25
Show file tree

Hide file tree

Showing 3 changed files with 673 additions and 754 deletions.
diff --git a/workflow/envs/merge/environment.yaml b/workflow/envs/merge/environment.yaml
@@ -57,6 +57,7 @@ dependencies:
 - conda-forge::pymdown-extensions=7.1
 - conda-forge::python=3.7.3
 - conda-forge::r-ape
+- conda-forge::scikit-bio=0.5.6
 - conda-forge::xopen=0.9.0
 - contextily=1.0.1
 - descartes=1.1.0

diff --git a/workflow/notebooks/beast.py.ipynb b/workflow/notebooks/beast.py.ipynb
@@ -0,0 +1,245 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "proprietary-spice",
+   "metadata": {},
+   "source": [
+    "# BEAST Analysis Notebook"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "gross-cutting",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "# 0. SETUP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "expanded-saudi",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "linear-burden",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ------------------------------------------\n",
+    "BRANCH_LIST = {\n",
+    "    \"0.PRE\": [\"0.PRE1\", \"0.PRE2\"], \n",
+    "    \"0.ANT4\" : [\"0.ANT4\"], \n",
+    "    \"0.PE\": [\"0.PE2\", \"0.PE4m\", \"0.PE4m\", \"0.PE4t\", \"0.PE4a\", \"0.PE5\", \"0.PE7\", \"0.PE8\", \"0.PE10\"],   \n",
+    "    \"0.ANT\": [\"0.ANT1\", \"0.ANT2\",\"0.ANT3\",\"0.ANT5\"],     \n",
+    "    \"1.PRE\" : [\"1.PRE0\",\"1.PRE1\", \"1.PRE2\", \"1.PRE3\"],    \n",
+    "    \"1.ANT\": [\"1.ANT1\"], \n",
+    "    \"1.IN\": [\"1.IN1\",\"1.IN2\",\"1.IN3\"],   \n",
+    "    \"1.ORI\" : [\"1.ORI1\", \"1.ORI2\", \"1.ORI3\"],\n",
+    "    \"2.ANT\": [\"2.ANT1\",\"2.ANT2\",\"2.ANT3\" ], \n",
+    "    \"2.MED\": [\"2.MED0\", \"2.MED1\",\"2.MED2\",\"2.MED3\" ], \n",
+    "    \"3.ANT\": [\"3.ANT1\", \"3.ANT2\" ],     \n",
+    "    \"4.ANT\": [\"4.ANT1\" ],           \n",
+    "}\n",
+    "\n",
+    "NUM_STATES = 10"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "mechanical-monte",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "# 1. IMPORT"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "special-clause",
+   "metadata": {},
+   "source": [
+    "## Import Log Files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "jewish-australian",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.PRE\n",
+      "0.ANT4\n",
+      "0.PE\n",
+      "0.ANT\n",
+      "1.PRE\n",
+      "1.ANT\n",
+      "1.IN\n",
+      "1.ORI\n",
+      "2.ANT\n",
+      "2.MED\n",
+      "3.ANT\n",
+      "4.ANT\n"
+     ]
+    }
+   ],
+   "source": [
+    "log_dir = \"/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main/beast/all/chromosome/clade\"\n",
+    "\n",
+    "# Construct a dictionary to hold the dataframes\n",
+    "log_dict = {branch:{\"strict\": {}, \"relaxed\": {}} for branch in BRANCH_LIST}\n",
+    "\n",
+    "for branch in BRANCH_LIST:\n",
+    "    print(branch)\n",
+    "    for filename in os.listdir(log_dir):\n",
+    "        filepath = os.path.join(log_dir, filename)\n",
+    "        if branch in filename:\n",
+    "            # Find Strict Clock log\n",
+    "            if \"SC\" in filename:\n",
+    "                strict_log = filepath\n",
+    "            # Find relaxed clock log\n",
+    "            elif \"UCLN\" in filename:\n",
+    "                relaxed_log = filepath\n",
+    "                \n",
+    "    # Add log files to dict\n",
+    "    log_dict[branch][\"strict\"][\"logfile\"] = strict_log\n",
+    "    log_dict[branch][\"relaxed\"][\"logfile\"] = relaxed_log"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aboriginal-intent",
+   "metadata": {},
+   "source": [
+    "## Initialize the Parameter data frame"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "absolute-receiver",
+   "metadata": {},
+   "source": [
+    "## Parse Log Files to DataFrames"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "lyric-alert",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.PRE strict\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "unsupported operand type(s) for +: 'dict' and 'dict'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-10-a924a7098954>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     29\u001b[0m                 \u001b[0mline_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mval\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mcol\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mval\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msplit_line\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     30\u001b[0m                 \u001b[0;31m# add the clade and col values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m                 \u001b[0mparam_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\"clade\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbranch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"clock\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mclock\u001b[0m\u001b[0;34m}\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mline_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     32\u001b[0m                 \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     33\u001b[0m         \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'dict' and 'dict'"
+     ]
+    }
+   ],
+   "source": [
+    "for branch in BRANCH_LIST:\n",
+    "    for clock in log_dict[branch]:\n",
+    "        print(branch, clock)\n",
+    "        log_filename = log_dict[branch][clock][\"logfile\"]\n",
+    "        with open(log_filename) as infile:\n",
+    "            read_file = infile.read().strip().split(\"\\n\")\n",
+    "            \n",
+    "            # Detect how many lines are headers with \"#\"\n",
+    "            for i in range (0, len(read_file)):\n",
+    "                # These are the column names\n",
+    "                if not read_file[i].startswith(\"#\"):\n",
+    "                    col_names = read_file[i].split(\"\\t\")\n",
+    "                    # Add branch and clock to col_names\n",
+    "                    col_names = [\"clade\",\"clock\"] + col_names\n",
+    "                    break\n",
+    "                    \n",
+    "            # Create dataframe from column names\n",
+    "            param_df = pd.DataFrame({col:[] for col in col_names})\n",
+    "                    \n",
+    "            # Remove the header lines\n",
+    "            read_file = read_file[i+1:]\n",
+    "            \n",
+    "            # Sample the desired number of states, from the end!\n",
+    "            sample_file = read_file[len(read_file)-NUM_STATES:]\n",
+    "\n",
+    "            # Parse values into dict\n",
+    "            for line in sample_file:\n",
+    "                split_line = line.split(\"\\t\")\n",
+    "                line_dict = {col:float(val) for col,val in zip(col_names, split_line)}\n",
+    "                # add the clade and col values\n",
+    "                param_dict = {\"clade\": branch, \"clock\" : clock} + line_dict\n",
+    "                print(param_dict)\n",
+    "        break\n",
+    "    break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "environmental-length",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = log_dict[\"0.PRE\"][\"strict\"][\"df\"]\n",
+    "\n",
+    "sns.kdeplot(x=df[\"meanRate\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "outdoor-advance",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/workflow/notebooks/iqtree_stats.py.ipynb b/workflow/notebooks/iqtree_stats.py.ipynb