Skip to content

Commit

Permalink
Add -n to take intersection of relatives
Browse files Browse the repository at this point in the history
The `-n` flag now takes the intersection of the relatives of the probands; that is, only those that all probands share. Without `-n`, the union is taken, which was the previous functionality.
  • Loading branch information
allytrope committed Jul 29, 2024
1 parent 28226ee commit 1ad6464
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 45 deletions.
15 changes: 12 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
### Input Options
| Arg | Description |
| --- | --- |
| `STDIN` or positional arg | TSV with columns: child, sire, and dam. |
| `stdin` or positional arg | TSV with columns: child, sire, and dam. |


### Proband Options
Expand All @@ -26,6 +26,7 @@
| `-b` | `--descendants` | Descendants only + self. |
| `-d <int>` | `--degree <int>` | Maximum degree of relationship. |
| `-m` | `--mates` | Keep mates. |
| `-n` | `--intersection` | Takes the intersection of relatives from all probands. |
| `-r <float>` | `--relationship-coefficient <float>` | Minimum coefficient of relationship. |

### Output Options
Expand All @@ -34,7 +35,7 @@
| `-Ol` | list | One individual per line. |
| `-Om` | matrix | Coefficients of relationship as a matrix. |
| `-Op` | PLINK | Plink-style `.ped`. |
| `-Ot` | TSV | Child, sire, and dam with tab-delimited columns. |
| `-Ot` | TSV | Child, sire, and dam with tab-delimited columns. (default) |
| `-Ow` | pairwise | Coefficients of relationship as a pairwise TSV. |


Expand All @@ -46,6 +47,9 @@ ped pedigree.tsv -p 111 -d 4
# Specify multiple probands as a comma-delimited string
ped pedigree.tsv -p 111,222,333 -d 4
# Find all ancestors that are shared between individuals "111" and "222".
ped pedigree.tsv -p 111,222 -an
# Change trios file to PLINK-style file
ped pedigree.tsv -Op
```
Expand Down Expand Up @@ -90,7 +94,7 @@ Does not need to be seekable, and so can also take a file through process substi

#### `-p <probands>`
A comma-delimited string of probands like so `-p 111,222,333`.
`-p` is incomplatible with `-P`.
`-p` is incompatible with `-P`.

### Filtering
Filtering options explain how to filter down a individuals in relation to proband(s). Thus using any of these require either `-P <probands_file>` or `-p <probands>`.
Expand All @@ -114,6 +118,11 @@ Some example values:
This flag will include mates of individuals in the subset that might have otherwise been filtered out. This step occurs after all other filtering.
This option is useful for when using output to generate a plot.

#### `-n`
When this flag is included, filtering will take the intersection of relatives for each proband. Without this flag, it is instead the union that is taken.

For example, `ped pedigree.tsv -p 111,222 -an ` will find all ancestors that are shared between individuals 111 and 222.

#### `-r <float>`
This option keeps only relatives with a coefficient of relationship greater than or equal to the specified float. While `-d <int>` keeps only the shorest path to determine degree, `-r <float>` sums the coefficients of all paths.

Expand Down
20 changes: 1 addition & 19 deletions io.nim
Original file line number Diff line number Diff line change
Expand Up @@ -86,24 +86,6 @@ proc write_list*(individuals: HashSet[Individual]) =
for indiv in sequence:
echo indiv.id

# proc write_matrix*(individuals: HashSet[Individual]) =
# #[Write a matrix of relationship coefficients.]#
# let sequence = individuals.toSeq().sorted(cmp=cmpIndividuals)

# type
# Cell = object
# indiv1: Individual
# indiv2: Individual
# val: float

# var cells: seq[Cell]
# #var matrix: seq[individuals.len(), seq[individuals.len(), float]]

# for individual in individuals:




proc write_plink*(individuals: HashSet[Individual]) =
#[Write individuals to PLINK-style TSV.
Expand Down Expand Up @@ -181,7 +163,7 @@ proc write_tsv*(individuals: HashSet[Individual]) =
else:
dam_id = ""

# Remove if parents are missing and is already listed as a parent of another
# When parents are missing and is already listed as a parent of another, don't include
if sire_id == "" and dam_id == "":
block singleton:
for individual in individuals:
Expand Down
57 changes: 43 additions & 14 deletions ped.nim
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import
docopt,
std/[hashes, options, sets, strformat, strutils, tables, terminal],
std/[enumerate, hashes, options, sets, strformat, strutils, tables, terminal],
io, relatives


Expand All @@ -19,6 +19,7 @@ Options:
a.k.a, the shortest-path distance.
-f, --force-probands No error if proband is missing from pedigree.
-m, --mates Include mates.
-n, --intersection Find intersection of filterings on each proband (as opposed to the union).
-O <format> Can be "l" for list, "m" for matrix, "p" for PLINK-style TSV,
"t" for 3-columned TSV, or "w" for pairwise.
-P <file>, --probands-file <file> File containing one proband per line.
Expand Down Expand Up @@ -77,25 +78,53 @@ for indiv in proband_strings:
else:
quit fmt"ERROR: {indiv} not in pedigree."

# Determine whether to take union or intersection of probands' relatives
var set_operation = union[Individual]
if args["--intersection"]:
set_operation = intersection[Individual]

### Subsetting generalizations
proc subset_kin(kin: proc) =
#[Find either union or intersection of relatives using a specific filtering method.]#
var combined_kin: HashSet[Individual]
for idx, proband in enumerate(probands):
if idx == 0:
combined_kin.incl(proband.kin())
else:
combined_kin = combined_kin.set_operation(proband.kin())
individuals = individuals.intersection(combined_kin)
proc subset_kin(kin: proc, coefficient: float) =
#[Find either union or intersection of relatives using a specific filtering method.]#
var combined_kin: HashSet[Individual]
for idx, proband in enumerate(probands):
if idx == 0:
combined_kin.incl(kin(proband, coefficient))
else:
combined_kin = combined_kin.set_operation(kin(proband, coefficient))
individuals = individuals.intersection(combined_kin)
proc subset_kin(kin: proc, coefficient: int) =
#[Find either union or intersection of relatives using a specific filtering method.]#
var combined_kin: HashSet[Individual]
for idx, proband in enumerate(probands):
if idx == 0:
combined_kin.incl(kin(proband, coefficient))
else:
combined_kin = combined_kin.set_operation(kin(proband, coefficient))
individuals = individuals.intersection(combined_kin)

# Find relatives by filtering method
if args["--degree"]:
let degree = to_int(parse_float($args["--degree"]))
individuals = individuals.intersection(relatives_by_degree(probands, degree))
let degree = to_int(parse_float($args["--degree"]))
subset_kin(relatives_by_degree, degree)
if args["--relationship-coefficient"]:
let coefficient = parse_float($args["--relationship-coefficient"])
individuals = individuals.intersection(filter_relatives(probands, coefficient))
let coefficient = parse_float($args["--relationship-coefficient"])
subset_kin(filter_relatives, coefficient)

# Optionally restrict to only ancestors or descendants (including proband(s))
# Restrict to only ancestors or descendants (including proband(s))
if args["--ancestors"]:
var combined_ancestors: HashSet[Individual]
for proband in probands:
combined_ancestors.incl(proband.ancestors())
individuals = individuals.intersection(combined_ancestors)
subset_kin(ancestors)
if args["--descendants"]:
var combined_descendants: HashSet[Individual]
for proband in probands:
combined_descendants.incl(proband.descendants())
individuals = individuals.intersection(combined_descendants)
subset_kin(descendants)

# Add back mates
if args["--mates"]:
Expand Down
16 changes: 7 additions & 9 deletions relatives.nim
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ proc descendants*(proband: Individual): HashSet[Individual] =


# Find descendants
proc relatives_by_degree*(probands: seq[Individual], degree: int): HashSet[Individual] =
proc relatives_by_degree*(proband: Individual, degree: int): HashSet[Individual] =
#[Find all relatives within a specified degree of relationship.
Finds relatives with a shortest path less than or equal to `degree` in graph G,
Expand Down Expand Up @@ -114,8 +114,7 @@ proc relatives_by_degree*(probands: seq[Individual], degree: int): HashSet[Indiv
depth_first_search(child, degree - 1)

# Begin iterations
for proband in probands:
depth_first_search(proband, degree)
depth_first_search(proband, degree)

return relatives

Expand Down Expand Up @@ -177,14 +176,13 @@ proc relatives_by_relationship*(proband: Individual, min_coefficient: float): Or

return coefficients

proc filter_relatives*(probands: seq[Individual], min_coefficient: float): HashSet[Individual] =
proc filter_relatives*(proband: Individual, min_coefficient: float): HashSet[Individual] =
#[Filter to only relatives at or above the minimum coefficient.]#
var relatives: HashSet[Individual]
for proband in probands:
let coefficients = relatives_by_relationship(proband, min_coefficient)
for indiv, coefficient in coefficients:
if coefficient >= min_coefficient:
relatives.incl(indiv)
let coefficients = relatives_by_relationship(proband, min_coefficient)
for indiv, coefficient in coefficients:
if coefficient >= min_coefficient:
relatives.incl(indiv)
return relatives

proc find_coefficients*(proband: Individual): OrderedTable[Individual, float] =
Expand Down

0 comments on commit 1ad6464

Please sign in to comment.