-
Notifications
You must be signed in to change notification settings - Fork 0
/
e070_clinicos_en_miryprot.r.Rout
455 lines (363 loc) · 11 KB
/
e070_clinicos_en_miryprot.r.Rout
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
R version 3.1.2 (2014-10-31) -- "Pumpkin Helmet"
Copyright (C) 2014 The R Foundation for Statistical Computing
Platform: x86_64-unknown-linux-gnu (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> ## d070_clinicos_en_miryprot.r
> ## 2015-03-26 julenmendieta92@gmail.com
> ##Modificado: 2015-05-21
> ## Script para generar una tabla con el contenido de los ficheros "clinical_patient" (No esta presente en todos los canceres)
> date ()
[1] "Tue May 26 14:44:01 2015"
> Sys.info ()[c("nodename", "user")]
nodename user
"crom01" "jmendieta"
> commandArgs ()
[1] "/opt/soft/libs/R/3.1.2_atlas/lib64/R/bin/exec/R"
[2] "-f"
[3] "e070_clinicos_en_miryprot.r"
[4] "--restore"
[5] "--save"
[6] "--no-readline"
> rm (list = ls ())
> R.version.string ##"R version 3.2.0 (2015-04-16)"
[1] "R version 3.1.2 (2014-10-31)"
>
> try (source (".job.r")); try (.job)
.job.r has been sourced
$name
[1] "tcga_download"
$dir
$dir$data
[1] "~/datos/2015/tcga_download"
$dir$code
[1] "~/trabajos/2015/tcga_download"
$dir$scripts
[1] "~/trabajos/2015/tcga_download/scripts"
$dir$docs
[1] "~/trabajos/2015/tcga_download/documents"
$dir$raw
[1] "~/datos/2015/tcga_download/data_raw"
$dir$annotation
[1] "~/datos/2015/tcga_download/data_annotation"
$dir$proces
[1] "~/datos/2015/tcga_download/data_processed"
$dir$plots
[1] "~/datos/2015/tcga_download/results/plots"
$dir$res
[1] "~/datos/2015/tcga_download/results/files"
$testmode
[1] FALSE
$dec
[1] "."
$idsep
[1] " /// "
>
> options (width = 170)
> #options (width = 1000)
>
> setwd (file.path (.job$dir$proces))
>
>
> #Ahora le añadimos los datos
>
> load (file.path (.job$dir$proces, "tabla_clinicos_muestra.RData")) #Con esto cargamos datos
> load (file.path (.job$dir$proces, "miR_exp.RData")) #Con esto cargamos miR.exp.info y mir.exp
> load (file.path (.job$dir$proces, "prot_exp.RData")) #Con esto cargamos prot.exp.info y prot.exp
>
> tag <- names(miR.exp)
> #Primero dejamos solo la referencia al paciente en los barcodes de miR.exp
> for (t in tag) {
+ barcode <- colnames(miR.exp[[t]])
+ colnames(miR.exp[[t]]) <- sub("-...-....-..", "", barcode)
+ }
>
>
> ##Luego creamos dos columnas en datos donde incluiremos TRUE o FALSE dependiendo de que haya datos de esa muestra para miR o prot
> #Como no tenemos datos clinicos de todos los tags, cambiamos la variable tag a los tags de los que si tenemos estos datos
> tag <- names(datos)
> for (t in tag) {
+ datos[[t]][, c("prot.expr", "miR.expr")] <- FALSE
+ }
>
> for (t in tag) {
+ for (i in 1:nrow(datos[[t]])) {
+ if (datos[[t]][i, "bcr_sample_barcode"] %in% colnames(miR.exp[[t]]) == TRUE) {
+ datos[[t]][i, "miR.expr"] <- TRUE
+ }
+ if (datos[[t]][i, "bcr_shipment_portion_uuid"] %in% colnames(prot.exp[[t]]) == TRUE) {
+ datos[[t]][i, "prot.expr"] <- TRUE
+ }
+ }
+ print (t)
+ print(table(datos[[t]]["prot.expr"]==TRUE))
+ print(table(datos[[t]]["miR.expr"]==TRUE))
+ }
[1] "ACC"
TRUE
46
TRUE
46
[1] "BLCA"
TRUE
344
FALSE TRUE
2 342
[1] "BRCA"
FALSE TRUE
653 410
FALSE TRUE
342 721
[1] "CESC"
TRUE
173
TRUE
173
[1] "CHOL"
TRUE
30
TRUE
30
[1] "COAD"
FALSE TRUE
96 333
FALSE TRUE
205 224
[1] "DLBC"
TRUE
33
TRUE
33
[1] "ESCA"
TRUE
126
FALSE TRUE
1 125
[1] "GBM"
FALSE TRUE
30 214
FALSE
244
[1] "HNSC"
FALSE TRUE
145 212
FALSE TRUE
34 323
[1] "KICH"
TRUE
63
TRUE
63
[1] "KIRC"
FALSE TRUE
24 454
FALSE TRUE
241 237
[1] "KIRP"
TRUE
216
TRUE
216
[1] "LGG"
TRUE
435
FALSE TRUE
4 431
[1] "LIHC"
FALSE
184
FALSE TRUE
5 179
[1] "LUAD"
FALSE TRUE
184 181
FALSE TRUE
49 316
[1] "LUSC"
FALSE TRUE
133 195
FALSE TRUE
92 236
[1] "MESO"
TRUE
63
TRUE
63
[1] "OV"
FALSE TRUE
244 412
FALSE TRUE
119 537
[1] "PAAD"
TRUE
123
FALSE TRUE
7 116
[1] "PCPG"
TRUE
82
TRUE
82
[1] "PRAD"
TRUE
352
FALSE TRUE
3 349
[1] "READ"
FALSE TRUE
61 103
FALSE TRUE
90 74
[1] "SARC"
TRUE
227
FALSE TRUE
3 224
[1] "SKCM"
FALSE TRUE
149 206
FALSE TRUE
17 338
[1] "STAD"
TRUE
392
FALSE TRUE
82 310
[1] "TGCT"
TRUE
122
TRUE
122
[1] "THCA"
FALSE TRUE
5 376
FALSE TRUE
1 380
[1] "THYM"
TRUE
90
TRUE
90
[1] "UCEC"
FALSE
440
FALSE TRUE
116 324
[1] "UCS"
TRUE
48
TRUE
48
[1] "UVM"
TRUE
12
TRUE
12
> #Hasta aqui ya tenemos una tabla con todos los datos clínicos, el tipo de muestra, y que indica si tiene datos de expresión de miR y prot
>
> #Creamos una data.frame en la que los datos de cada tumor estan al mismo nivel
> tags <- names (datos)
> mat <- NULL
> for (ta in tags) {
+ dt <- datos[[ta]]
+ dt[,"tag"] <- ta
+ mat <- rbind (mat, dt)
+ }
> dim (mat)
[1] 8421 18
> mat[1:3,]
bcr_sample_barcode shipment_portion_bcr_aliquot_barcode bcr_shipment_portion_uuid patient sample_type bcr_patient_uuid gender
1 TCGA-OR-A5J2-01A TCGA-OR-A5J2-01A-21-A39K-20 F9762BBB-BCA0-4B54-A2C8-6F81A91DE22F TCGA-OR-A5J2 Primary Tumor 8E7C2E31-D085-4B75-A970-162526DD07A0 FEMALE
2 TCGA-OR-A5J3-01A TCGA-OR-A5J3-01A-21-A39K-20 21AB5BC5-D822-4C70-BAD0-1E390A55456E TCGA-OR-A5J3 Primary Tumor DFD687BC-6E69-42F7-AF94-D17FC150D1A1 FEMALE
3 TCGA-OR-A5J6-01A TCGA-OR-A5J6-01A-41-A39K-20 BBE6AF7B-BCDD-4179-94E2-2978AF7A03A4 TCGA-OR-A5J6 Primary Tumor C8898B42-B704-45A0-9829-144B98F416E0 FEMALE
race ethnicity history_other_malignancy history_neoadjuvant_treatment vital_status informed_consent_verified patient_id
1 WHITE HISPANIC OR LATINO No No Dead YES A5J2
2 WHITE HISPANIC OR LATINO No No Alive YES A5J3
3 BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO No No Alive YES A5J6
tissue_source_site prot.expr miR.expr tag
1 OR TRUE TRUE ACC
2 OR TRUE TRUE ACC
3 OR TRUE TRUE ACC
>
> summary (mat)
bcr_sample_barcode shipment_portion_bcr_aliquot_barcode bcr_shipment_portion_uuid patient sample_type bcr_patient_uuid gender
Length:8421 Length:8421 Length:8421 Length:8421 Length:8421 Length:8421 Length:8421
Class :character Class :character Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character
race ethnicity history_other_malignancy history_neoadjuvant_treatment vital_status informed_consent_verified patient_id
Length:8421 Length:8421 Length:8421 Length:8421 Length:8421 Length:8421 Length:8421
Class :character Class :character Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character
tissue_source_site prot.expr miR.expr tag
Length:8421 Mode :logical Mode :logical Length:8421
Class :character FALSE:2348 FALSE:1657 Class :character
Mode :character TRUE :6073 TRUE :6764 Mode :character
NA's :0 NA's :0
>
>
> #Eliminamos los pacientes que no tengan datos de miR y prot
> touse <- mat[,"prot.expr"] & mat[, "miR.expr"]
> table (touse)
touse
FALSE TRUE
3571 4850
> mat <- mat[touse,]
>
> save (list = "mat", file = file.path (.job$dir$proces, "clinicos_miryprot.RData"))
>
> #Miramos los tipos de muestras tomadas, solo hay 45 de tejido normal.
> t (t (table (mat[,"sample_type"])))
[,1]
Additional - New Primary 6
Metastatic 177
Primary Tumor 4658
Recurrent Tumor 9
>
> #Vamos a buscar el cancer que tenga mas muestras con datos de miR y prot
> table(mat[,"tag"])
ACC BLCA BRCA CESC CHOL COAD DLBC ESCA HNSC KICH KIRC KIRP LGG LUAD LUSC MESO OV PAAD PCPG PRAD READ SARC SKCM STAD TGCT THCA THYM UCS UVM
46 342 162 173 30 193 33 125 180 63 213 216 431 146 105 63 333 116 82 349 64 224 204 310 122 375 90 48 12
> max(table(mat[,"tag"])) #En este caso vemos que es LGG
[1] 431
>
>
> # #Miramos a que tumor pertenecen las muestras de tejido sano
> # st <- mat[,"sample_type"] == "Solid Tissue Normal"
> # table (st)
> # mat[st,]
> # table (mat[st, "tag"])
> #
> # #Comprobamos que para un mismo paciente tengamos muestras tanto de tejido sano como enfermo.
> # pacientes <- mat[st, "patient"]
> # buenos<- mat[,"patient"] %in% pacientes
> # table (mat[buenos, "patient"])
> # table (mat[buenos, "sample_type"])
> #
> # mat[buenos, c("patient", "sample_type")]
> # matriz <- mat[buenos,]
>
>
>
> ###EXIT
> warnings ()
NULL
> sessionInfo ()
R version 3.1.2 (2014-10-31)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
> q ("no")
> proc.time()
user system elapsed
4.931 0.246 5.616