-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
85 lines (63 loc) · 2.08 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import polars as pl
# Scratch fixture: two rows keyed by "k", each carrying list-typed columns.
# Explicit dtypes pin the inner element types; the "A" row holds a single
# null in both of its UInt32 list columns.
df = pl.DataFrame(
    [
        pl.Series("k", ["B", "A"], dtype=pl.Utf8),
        pl.Series("a", [[5], [1, 30]], dtype=pl.List(pl.Int64)),
        pl.Series("b", [[7], [2, 40]], dtype=pl.List(pl.Int64)),
        pl.Series(
            "mask_2in1",
            [[True, True], [False, False]],
            dtype=pl.List(pl.Boolean),
        ),
        pl.Series(
            "starts_2in1",
            [[0, 0], [None]],
            dtype=pl.List(pl.UInt32),
        ),
        pl.Series(
            "ends_2in1",
            [[1, 1], [None]],
            dtype=pl.List(pl.UInt32),
        ),
    ]
)

# A bare `raise` with no active exception raises RuntimeError, so the script
# halts here whenever this line is reached.
raise
# Minimal interval-join reproduction: two identical one-row frames.
df = pl.DataFrame(
    {
        "starts": [0],
        "ends": [1],
    }
)
df2 = pl.DataFrame(
    {
        "starts": [0],
        "ends": [1],
    }
)

# `pf` is never referenced by name; presumably this import registers the
# `interval` namespace used on the frame below -- TODO confirm.
import poranges as pf

# Join the two frames on the (starts, ends) column pair and print the
# collected result.
print(
    df.interval
    .join(df2, on=("starts", "ends"))
    .collect()
)

# A bare `raise` with no active exception raises RuntimeError, so the script
# halts here whenever this line is reached.
raise
import polars as pl
# Four intervals stored as parallel start/end columns.
df = pl.DataFrame(
    {
        "starts": [0, 80, 106, 5],
        "ends": [60, 290, 200, 107],
    }
)

# A bare `raise` with no active exception raises RuntimeError, so the script
# halts here whenever this line is reached.
raise
# One-row frame: a scalar chromosome label plus four list-valued columns
# named "<top|bottom>_<left|right>".
df = pl.DataFrame(
    {
        "chromosome": "chr1",
        "top_right": [[0]],
        "top_left": [[3]],
        "bottom_left": [[6, 7]],
        "bottom_right": [[4, 5]],
    }
)

# Per chromosome, combine the "top_*" and "bottom_*" column pairs. The rename
# callback keeps the second underscore-separated segment of each column name
# (e.g. "top_left" -> "left"). The aggregation result is neither assigned nor
# printed.
# NOTE(review): `Expr.alias_map` and `pl.concat` over expressions may not
# exist in the installed polars version -- confirm before relying on this.
df.groupby("chromosome").agg(
    pl.concat(
        [
            pl.col("top_left", "top_right").alias_map(
                lambda name: name.split("_")[1]
            ),
            pl.col("bottom_left", "bottom_right").alias_map(
                lambda name: name.split("_")[1]
            ),
        ],
        how="horizontal",
    )
)

# A bare `raise` with no active exception raises RuntimeError, so the script
# halts here whenever this line is reached.
raise
# Not so simple:
# we cannot just remove all but the first tiles.
# Window width; `to_window` and `all_overlapping_windows_list` read this
# module-level value when they are called.
win_size = 100

# One-row frame with starts=32 and ends=490.
df2 = pl.DataFrame(
    {
        "starts": [32],
        "ends": [490],
    }
)
def to_window(column):
    """Return an expression that snaps ``column`` down to its window start.

    The expression subtracts each value's remainder modulo ``win_size`` from
    the value itself. ``win_size`` is the module-level setting and is read
    when this function is called.

    Args:
        column: Column name handed to ``pl.col``.

    Returns:
        A polars expression; nothing is evaluated here.
    """
    value = pl.col(column)
    remainder = value.mod(pl.lit(win_size))
    return value.sub(remainder)
def all_overlapping_windows_list(start_col, end_col):
    """Return an expression ranging over the windows between two columns.

    Both bounds are first snapped to window starts with ``to_window``; the
    range then steps by the module-level ``win_size``.

    NOTE(review): the caller treats the result as a list column (it applies
    ``.list.lengths()`` to it); whether ``pl.arange`` yields one list per row
    for expression bounds depends on the polars version -- confirm.

    Args:
        start_col: Name of the column holding range starts.
        end_col: Name of the column holding range ends.

    Returns:
        A polars expression built with ``pl.arange``.
    """
    first_window = to_window(start_col)
    # The upper bound of `pl.arange` is exclusive, so adding 1 keeps the
    # window that the end value itself falls into.
    upper_bound = to_window(end_col) + 1
    return pl.arange(first_window, upper_bound, win_size)
def one_row_per_overlapping_window(window_list_col):
    """Build the selection that turns a list column into one row per item.

    Args:
        window_list_col: Name of the list-typed column to unnest.

    Returns:
        A two-element list of polars expressions, in this order:

        1. every column except ``window_list_col``, each value repeated by
           the length of that row's list and then exploded;
        2. ``window_list_col`` itself, exploded.
    """
    windows = pl.col(window_list_col)
    # Repeat the remaining columns once per list item so that, after
    # exploding, they line up with the exploded window column.
    repeated_others = (
        pl.exclude(window_list_col)
        .repeat_by(windows.list.lengths())
        .explode()
    )
    return [repeated_others, windows.explode()]
# Sort the intervals, attach the list of windows for each one as "windows",
# unnest to one row per (interval, window) pair, order by window, and print.
# NOTE(review): this pipeline reads `df`, not the `df2` defined a few lines
# above -- confirm which frame is intended.
print(
    df.lazy()
    .sort("starts", "ends")
    .with_columns(
        [
            all_overlapping_windows_list("starts", "ends").alias("windows"),
        ]
    )
    .select(one_row_per_overlapping_window("windows"))
    .sort("windows")
    .collect()
)
# print(g.groupby(pl.col("arange")).agg([pl.lit(1).alias("g")]).collect()