Add -n to take intersection of relatives

The `-n` flag now takes the intersection of the relatives of the probands; that is, only those that all probands share. Without `-n`, the union is taken, which was the previous functionality.
allytrope · Jul 29, 2024 · 1ad6464 · 1ad6464
1 parent 28226ee
commit 1ad6464
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
 ### Input Options
 | Arg | Description |
 | --- | --- |
-| `STDIN` or positional arg | TSV with columns: child, sire, and dam. |
+| `stdin` or positional arg | TSV with columns: child, sire, and dam. |
 
 
 ### Proband Options
@@ -26,6 +26,7 @@
 | `-b` | `--descendants` | Descendants only + self. |
 | `-d <int>` | `--degree <int>` | Maximum degree of relationship. |
 | `-m` | `--mates` | Keep mates. |
+| `-n` | `--intersection` | Takes the intersection of relatives from all probands. |
 | `-r <float>` | `--relationship-coefficient <float>` | Minimum coefficient of relationship. |
 
 ### Output Options
@@ -34,7 +35,7 @@
 | `-Ol` | list | One individual per line. |
 | `-Om` | matrix | Coefficients of relationship as a matrix. |
 | `-Op` | PLINK | Plink-style `.ped`. |
-| `-Ot` | TSV | Child, sire, and dam with tab-delimited columns. |
+| `-Ot` | TSV | Child, sire, and dam with tab-delimited columns. (default) |
 | `-Ow` | pairwise | Coefficients of relationship as a pairwise TSV. |
 
 
@@ -46,6 +47,9 @@ ped pedigree.tsv -p 111 -d 4
 # Specify multiple probands as a comma-delimited string
 ped pedigree.tsv -p 111,222,333 -d 4
 
+# Find all ancestors that are shared between individuals "111" and "222".
+ped pedigree.tsv -p 111,222 -an
+
 # Change trios file to PLINK-style file
 ped pedigree.tsv -Op
 ```
@@ -90,7 +94,7 @@ Does not need to be seekable, and so can also take a file through process substi
 
 #### `-p <probands>`
 A comma-delimited string of probands like so `-p 111,222,333`.
-`-p` is incomplatible with `-P`.
+`-p` is incompatible with `-P`.
 
 ### Filtering
 Filtering options explain how to filter down a individuals in relation to proband(s). Thus using any of these require either `-P <probands_file>` or `-p <probands>`.
@@ -114,6 +118,11 @@ Some example values:
 This flag will include mates of individuals in the subset that might have otherwise been filtered out. This step occurs after all other filtering.
 This option is useful for when using output to generate a plot.
 
+#### `-n`
+When this flag is included, filtering will take the intersection of relatives for each proband. Without this flag, it is instead the union that is taken.
+
+For example, `ped pedigree.tsv -p 111,222 -an ` will find all ancestors that are shared between individuals 111 and 222.
+
 #### `-r <float>`
 This option keeps only relatives with a coefficient of relationship greater than or equal to the specified float. While `-d <int>` keeps only the shorest path to determine degree, `-r <float>` sums the coefficients of all paths.
 

diff --git a/io.nim b/io.nim
@@ -86,24 +86,6 @@ proc write_list*(individuals: HashSet[Individual]) =
   for indiv in sequence:
     echo indiv.id
 
-# proc write_matrix*(individuals: HashSet[Individual]) =
-#   #[Write a matrix of relationship coefficients.]#
-#   let sequence = individuals.toSeq().sorted(cmp=cmpIndividuals)
-
-#   type
-#     Cell = object
-#       indiv1: Individual
-#       indiv2: Individual
-#       val: float
-
-#   var cells: seq[Cell]
-#   #var matrix: seq[individuals.len(), seq[individuals.len(), float]]
-
-#   for individual in individuals:
-
-
-
-
 proc write_plink*(individuals: HashSet[Individual]) =
   #[Write individuals to PLINK-style TSV.
   
@@ -181,7 +163,7 @@ proc write_tsv*(individuals: HashSet[Individual]) =
     else:
       dam_id = ""
 
-    # Remove if parents are missing and is already listed as a parent of another
+    # When parents are missing and is already listed as a parent of another, don't include
     if sire_id == "" and dam_id == "":
       block singleton:
         for individual in individuals:

diff --git a/ped.nim b/ped.nim
@@ -1,6 +1,6 @@
 import
   docopt,
-  std/[hashes, options, sets, strformat, strutils, tables, terminal],
+  std/[enumerate, hashes, options, sets, strformat, strutils, tables, terminal],
   io, relatives
 
 
@@ -19,6 +19,7 @@ Options:
                                                   a.k.a, the shortest-path distance.
   -f, --force-probands                            No error if proband is missing from pedigree.
   -m, --mates                                     Include mates.
+  -n, --intersection                              Find intersection of filterings on each proband (as opposed to the union).
   -O <format>                                     Can be "l" for list, "m" for matrix, "p" for PLINK-style TSV,
                                                   "t" for 3-columned TSV, or "w" for pairwise.
   -P <file>, --probands-file <file>               File containing one proband per line.
@@ -77,25 +78,53 @@ for indiv in proband_strings:
     else:
       quit fmt"ERROR: {indiv} not in pedigree."
 
+# Determine whether to take union or intersection of probands' relatives
+var set_operation = union[Individual]
+if args["--intersection"]:
+  set_operation = intersection[Individual]
+
+### Subsetting generalizations
+proc subset_kin(kin: proc) =
+  #[Find either union or intersection of relatives using a specific filtering method.]#
+  var combined_kin: HashSet[Individual]
+  for idx, proband in enumerate(probands):
+    if idx == 0:
+      combined_kin.incl(proband.kin())
+    else:
+      combined_kin = combined_kin.set_operation(proband.kin())
+  individuals = individuals.intersection(combined_kin)
+proc subset_kin(kin: proc, coefficient: float) =
+  #[Find either union or intersection of relatives using a specific filtering method.]#
+  var combined_kin: HashSet[Individual]
+  for idx, proband in enumerate(probands):
+    if idx == 0:
+      combined_kin.incl(kin(proband, coefficient))
+    else:
+      combined_kin = combined_kin.set_operation(kin(proband, coefficient))
+  individuals = individuals.intersection(combined_kin)
+proc subset_kin(kin: proc, coefficient: int) =
+  #[Find either union or intersection of relatives using a specific filtering method.]#
+  var combined_kin: HashSet[Individual]
+  for idx, proband in enumerate(probands):
+    if idx == 0:
+      combined_kin.incl(kin(proband, coefficient))
+    else:
+      combined_kin = combined_kin.set_operation(kin(proband, coefficient))
+  individuals = individuals.intersection(combined_kin)
+
 # Find relatives by filtering method
 if args["--degree"]:
-    let degree = to_int(parse_float($args["--degree"]))
-    individuals = individuals.intersection(relatives_by_degree(probands, degree))
+  let degree = to_int(parse_float($args["--degree"]))
+  subset_kin(relatives_by_degree, degree)
 if args["--relationship-coefficient"]:
-    let coefficient = parse_float($args["--relationship-coefficient"])
-    individuals = individuals.intersection(filter_relatives(probands, coefficient))
+  let coefficient = parse_float($args["--relationship-coefficient"])
+  subset_kin(filter_relatives, coefficient)
 
-# Optionally restrict to only ancestors or descendants (including proband(s))
+# Restrict to only ancestors or descendants (including proband(s))
 if args["--ancestors"]:
-  var combined_ancestors: HashSet[Individual]
-  for proband in probands:
-    combined_ancestors.incl(proband.ancestors())
-  individuals = individuals.intersection(combined_ancestors)
+  subset_kin(ancestors)
 if args["--descendants"]:
-  var combined_descendants: HashSet[Individual]
-  for proband in probands:
-    combined_descendants.incl(proband.descendants())
-  individuals = individuals.intersection(combined_descendants)
+  subset_kin(descendants)
 
 # Add back mates
 if args["--mates"]:

diff --git a/relatives.nim b/relatives.nim
@@ -70,7 +70,7 @@ proc descendants*(proband: Individual): HashSet[Individual] =
 
 
 # Find descendants
-proc relatives_by_degree*(probands: seq[Individual], degree: int): HashSet[Individual] =
+proc relatives_by_degree*(proband: Individual, degree: int): HashSet[Individual] =
   #[Find all relatives within a specified degree of relationship.
   
   Finds relatives with a shortest path less than or equal to `degree` in graph G,
@@ -114,8 +114,7 @@ proc relatives_by_degree*(probands: seq[Individual], degree: int): HashSet[Indiv
         depth_first_search(child, degree - 1)
 
   # Begin iterations
-  for proband in probands:
-    depth_first_search(proband, degree)
+  depth_first_search(proband, degree)
 
   return relatives
 
@@ -177,14 +176,13 @@ proc relatives_by_relationship*(proband: Individual, min_coefficient: float): Or
 
   return coefficients
 
-proc filter_relatives*(probands: seq[Individual], min_coefficient: float): HashSet[Individual] =
+proc filter_relatives*(proband: Individual, min_coefficient: float): HashSet[Individual] =
     #[Filter to only relatives at or above the minimum coefficient.]#
     var relatives: HashSet[Individual]
-    for proband in probands:
-      let coefficients = relatives_by_relationship(proband, min_coefficient)
-      for indiv, coefficient in coefficients:
-        if coefficient >= min_coefficient:
-          relatives.incl(indiv)
+    let coefficients = relatives_by_relationship(proband, min_coefficient)
+    for indiv, coefficient in coefficients:
+      if coefficient >= min_coefficient:
+        relatives.incl(indiv)
     return relatives
 
 proc find_coefficients*(proband: Individual): OrderedTable[Individual, float] =