-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path02-03-public_databases.html
571 lines (488 loc) · 29.4 KB
/
02-03-public_databases.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="author" content="Morgan Feeney, Leighton Pritchard" />
<title>How to Access Data from Public Databases</title>
<script src="05-public_databases_files/header-attrs-2.10/header-attrs.js"></script>
<script src="05-public_databases_files/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="05-public_databases_files/bootstrap-3.3.5/css/lumen.min.css" rel="stylesheet" />
<script src="05-public_databases_files/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="05-public_databases_files/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="05-public_databases_files/bootstrap-3.3.5/shim/respond.min.js"></script>
<style>h1 {font-size: 34px;}
h1.title {font-size: 38px;}
h2 {font-size: 30px;}
h3 {font-size: 24px;}
h4 {font-size: 18px;}
h5 {font-size: 16px;}
h6 {font-size: 12px;}
code {color: inherit; background-color: rgba(0, 0, 0, 0.04);}
pre:not([class]) { background-color: white }</style>
<script src="05-public_databases_files/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="05-public_databases_files/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="05-public_databases_files/tocify-1.9.1/jquery.tocify.js"></script>
<script src="05-public_databases_files/navigation-1.1/tabsets.js"></script>
<link href="05-public_databases_files/highlightjs-9.12.0/default.css" rel="stylesheet" />
<script src="05-public_databases_files/highlightjs-9.12.0/highlight.js"></script>
<style type="text/css">
/* Small helper rules for inline code, span/div utility classes and task lists. */
/* NOTE(review): a later <style> block in this file declares `code{white-space: pre;}`
   with the same selector; by source order that later rule presumably takes
   precedence over this one — confirm which wrapping behaviour is intended. */
code{white-space: pre-wrap;}
/* Utility classes for small-caps and underlined spans. */
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
/* Two-up column layout: each div.column takes half the width, aligned to the top. */
div.column{display: inline-block; vertical-align: top; width: 50%;}
/* Hanging indent: first line is pulled back by the same amount the block is indented. */
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
/* Task lists are rendered without list bullets. */
ul.task-list{list-style: none;}
</style>
<style type="text/css">code{white-space: pre;}</style>
<script type="text/javascript">
// Syntax-highlighting bootstrap. Everything is skipped when highlight.js
// did not load (window.hljs is absent).
if (window.hljs) {
  // Configure with an empty `languages` list, then register the on-load pass.
  hljs.configure({languages: []});
  hljs.initHighlightingOnLoad();

  // If the document has already finished loading by the time this script
  // runs, queue an explicit highlighting pass with a zero-delay timeout.
  if ("complete" === document.readyState) {
    window.setTimeout(function () {
      hljs.initHighlighting();
    }, 0);
  }
}
</script>
<link rel="stylesheet" href="css/rmd_style.css" type="text/css" />
<style type = "text/css">
/* General page layout rules. */
/* Cap the main container at 940px and centre it horizontally.
   NOTE(review): a later <style> block declares `div.main-container { max-width: 1200px; }`,
   a more specific selector, so that wider limit presumably applies instead — confirm. */
.main-container {
max-width: 940px;
margin-left: auto;
margin-right: auto;
}
/* Images never exceed the width of their container. */
img {
max-width:100%;
}
/* Space above tabbed panes. */
.tabbed-pane {
padding-top: 12px;
}
/* Space below HTML widgets. */
.html-widget {
margin-bottom: 20px;
}
/* NOTE(review): this removes the focus outline from code-folding buttons with no
   replacement focus style in this block — check keyboard-focus visibility. */
button.code-folding-btn:focus {
outline: none;
}
/* Render <summary> as a list item (the page uses <details>/<summary> for expandable sections). */
summary {
display: list-item;
}
/* No padding on code nested inside <pre>. */
pre code {
padding: 0;
}
</style>
<!-- tabsets -->
<style type="text/css">
.tabset-dropdown > .nav-tabs {
display: inline-table;
max-height: 500px;
min-height: 44px;
overflow-y: auto;
border: 1px solid #ddd;
border-radius: 4px;
}
.tabset-dropdown > .nav-tabs > li.active:before {
content: "";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
border-right: 1px solid #ddd;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
content: "";
border: none;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
content: "";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
border-right: 1px solid #ddd;
}
.tabset-dropdown > .nav-tabs > li.active {
display: block;
}
.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
border: none;
display: inline-block;
border-radius: 4px;
background-color: transparent;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
display: block;
float: none;
}
.tabset-dropdown > .nav-tabs > li {
display: none;
}
</style>
<!-- table of contents (tocify) layout -->
<style type="text/css">
/* Layout for the table-of-contents sidebar (#TOC / .tocify) and the
   content column beside it (.toc-content). */
#TOC {
margin: 25px 0px 20px 0px;
}
/* Viewports up to 768px wide: the TOC is positioned relatively and spans the full width. */
@media (max-width: 768px) {
#TOC {
position: relative;
width: 100%;
}
}
/* Print: float the content column to the right. */
@media print {
.toc-content {
/* see https://github.com/w3c/csswg-drafts/issues/4434 */
float: right;
}
}
/* Horizontal padding around the content column. */
.toc-content {
padding-left: 30px;
padding-right: 40px;
}
/* Container width limit of 1200px. This selector (element + class) is more specific
   than the plain `.main-container` rule (940px) in an earlier <style> block of this file. */
div.main-container {
max-width: 1200px;
}
/* Default TOC size: 20% of the width, capped at 260px wide and 85% tall. */
div.tocify {
width: 20%;
max-width: 260px;
max-height: 85%;
}
/* Viewports 768px-991px wide: TOC takes 25% of the width. */
@media (min-width: 768px) and (max-width: 991px) {
div.tocify {
width: 25%;
}
}
/* Viewports up to 767px wide: TOC is full width with no max-width cap. */
@media (max-width: 767px) {
div.tocify {
width: 100%;
max-width: none;
}
}
/* Fixed line height for TOC lists and entries. */
.tocify ul, .tocify li {
line-height: 20px;
}
/* Sub-header entries use a slightly smaller font. */
.tocify-subheader .tocify-item {
font-size: 0.90em;
}
/* Square corners on TOC list-group items. */
.tocify .list-group-item {
border-radius: 0px;
}
</style>
</head>
<body>
<div class="container-fluid main-container">
<!-- setup 3col/9col grid for toc_float and main content -->
<div class="row">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>
<div class="toc-content col-xs-12 col-sm-8 col-md-9">
<div id="header">
<h1 class="title toc-ignore">How to Access Data from Public Databases</h1>
<h4 class="author">Morgan Feeney, Leighton Pritchard</h4>
<h4 class="date">2021 Presentation</h4>
</div>
<div id="summary">
<ul>
<li>A wealth of information is freely available in public databases, and can be searched and accessed for use in your project.
<ul>
<li>Specialised databases are available for different purposes</li>
</ul></li>
<li>Each item of data deposited into a database is assigned a unique identifier called an <em>accession number</em>.</li>
<li>Deposition of research data in public repositories is often a requirement for publication</li>
<li>Databases are not equally authoritative: some are databases of record, and not all are curated for quality control and accuracy.</li>
</ul>
<p>This part of the workshop aims to introduce you to several ways of accessing data from public databases.</p>
</div>
<div id="introduction" class="section level1" number="1">
<h1><span class="header-section-number">1</span> Introduction</h1>
<p>Many, if not all, projects will involve the use of data from publicly available databases. You may be analysing these data as the main focus of your project, or comparing data you have generated with publicly available data (e.g. comparing the 16S rRNA gene sequence from an organism you isolated with 16S rRNA gene sequences deposited in the NCBI database.)</p>
<p>Databases provide an essential service in storing, organizing, and allowing researchers to easily search and access data. Most journals (certainly all reputable journals!) require researchers to deposit any data generated as part of a publication in the appropriate public database.</p>
<details>
<summary>
Some example journal requirements for data deposition (click to expand)
</summary>
<ul>
<li><a href="https://rupress.org/jem/pages/data-deposition">Journal of Experimental Medicine</a></li>
<li><a href="https://academic.oup.com/nar/pages/data_deposition_and_standardization">Nucleic Acids Research</a></li>
<li><a href="https://www.mdpi.com/journal/data/instructions#suppmaterials">MDPI Journals</a></li>
</ul>
</details>
</div>
<div id="where-can-we-get-public-data-from" class="section level1" number="2">
<h1><span class="header-section-number">2</span> Where can we get public data from?</h1>
<p>There are specialised databases for particular types of data. The databases that are relevant to you will depend on the specific task you are carrying out. For example, if you are trying to find the 3D structure of your protein of interest, the appropriate data will be stored in the <a href="https://www.rcsb.org/">Protein Data Bank</a>, but not in the <a href="https://www.ncbi.nlm.nih.gov/genbank/">NCBI Genbank</a> database (this database stores nucleotide and amino acid sequence information).</p>
<div id="note">
<p>Many public databases are connected by database <em>crosslinks</em> (also known as <em>xlinks</em>). Some of the most useful databases, such as <a href="https://www.ebi.ac.uk/uniprot/">UniProt</a>, are valuable not just because they contain useful data but because they <em>cross-link</em> between so many other databases to connect very different data types for easy <em>integration</em> of datasets.</p>
</div>
<p>Most databases can be accessed interactively through a web-based interface. Many can also be accessed <em>via</em> the command line, or <em>programmatically</em> (not covered in this workshop).</p>
<ul>
<li><a href="./database_list.html">Links to some key public databases</a></li>
</ul>
<div id="repositories-of-record-vs-domain-specific-curated-or-not-resources" class="section level2" number="2.1">
<h2><span class="header-section-number">2.1</span> Repositories of record vs domain-specific curated (or not) resources</h2>
<p>Primary databases usually provide experimentally-derived data, such as the 3D protein structures solved by X-ray crystallography, NMR or Cryo-EM, whose data are deposited at the PDB. Similarly, the <a href="https://www.ncbi.nlm.nih.gov/sra">Sequence Read Archive (SRA)</a> stores raw reads obtained from high-throughput sequencing, and <a href="https://www.cancer.gov/about-nci/organization/ccg/research/structural-genomics/tcga">The Cancer Genome Atlas (TCGA)</a> provides access to genomic, transcriptomic, and clinical data for a range of cancers.</p>
<p>Both the PDB and SRA are <em>Repositories of Record</em>. In addition to being stores of experimentally-derived data, they are internationally recognised as the scientific community’s main repository for their datatypes. As such, journals may require that any suitable data is deposited in these repositories as a condition of publication. These kinds of repositories are required to be stable over decades, so are often funded at a national or international level.</p>
<p>There are a large number of subject- or organism-specific databases. These may contain primary data, but often contain data derived from analysis of primary databases, or integrate data from multiple sources to provide a comprehensive view of a particular topic. For example, the <a href="https://card.mcmaster.ca/">Comprehensive Antibiotic Resistance Database (CARD)</a> is a curated database of known resistance determinants and associated antibiotics. Curation can increase the reliability and trustworthiness of a database, as problematic data may be excluded and only well-evidenced data included. However, curation is time-consuming, and these databases may be smaller than comprehensive databases that have less quality control.</p>
<details>
<summary>
UniProt, Swiss-Prot and TrEMBL: Curated and Uncurated (click to expand)
</summary>
<p>The <code>UniProt</code> database comprises two main sequence databases under the name <code>UniProtKB</code> (UniProt Knowledgebase): <code>Swiss-Prot</code> and <code>TrEMBL</code>. The <code>TrEMBL</code> database contains all translated protein-coding sequences from public nucleic acid databases. The vast majority of these translated sequences are predictions - they have never been experimentally studied, or their existence confirmed. By contrast, the <code>Swiss-Prot</code> database contains only non-redundant, manually annotated protein sequences. Each sequence in <code>Swiss-Prot</code> carries an <em>evidence code</em> to indicate the strength of experimental evidence for the existence of the protein.</p>
Manual annotation takes time, and there are relatively few annotators. As a result, at the time of writing, there are over 400 proteins in <code>TrEMBL</code> for every protein in <code>Swiss-Prot</code>, and the ratio increases with every version of the database.
</details>
<div id="warning">
<p><strong>Watch out for unmaintained sites!</strong></p>
<p>Major databases are continuously updated and well-maintained. For example, the databases in the International Sequence Database Collaboration (INSDC: Genbank, EMBL, DDBJ) automatically update one another with new data collected daily.</p>
<p>However, not all databases are equally well-curated. Some will persist, even after they are no longer curated - and thus may contain out-of-date information, or not have the latest updates. <em>Caveat emptor…</em></p>
</div>
<div id="note">
<p><strong>Database versioning</strong></p>
<p>Database contents change over time as new data is added, redundant or incorrect data is removed, or existing records are otherwise modified. It is important for reproducibility, therefore, to report the version information appropriate to your dataset.</p>
<p>Some databases, such as <code>UniProt</code>, provide a <a href="https://www.uniprot.org/uniprot/B6J853?version=*">complete history</a> for each database entry. Other databases provide a version number for each accession, such as the number following the decimal point in <code>GenBank</code> records like <a href="https://www.ncbi.nlm.nih.gov/assembly/GCF_007858975.2/">GCF_007858975.2</a>. Others may provide a release version number for the database as a whole - and <code>GenBank</code> does this also, every two months (<a href="https://www.ncbi.nlm.nih.gov/genbank/release/">GenBank releases</a>). Some databases provide no version information, and these should be reported with a date when the database was accessed.</p>
</div>
</div>
<div id="different-sites-may-have-very-different-accessfunctionalitysearching" class="section level2" number="2.2">
<h2><span class="header-section-number">2.2</span> Different sites may have very different access/functionality/searching</h2>
<p>Most databases can be searched by keywords, though the search interfaces may vary. Many sequence databases will allow you to search by sequence similarity (e.g., <code>BLASTP</code> to search for proteins similar to a query protein).</p>
<div id="warning">
<p>Pay careful attention to your search terms!</p>
<p>As with any query to a computer database, keyword searches in biological databases are sensitive to typos or spelling errors. For example, searching <code>UniProt</code> for “lipaomide dehydrogenase” retrieves <a href="https://www.uniprot.org/uniprot/?query=lipaomide+dehydrogenase&sort=score">0 results</a>, while the correctly spelled search “lipoamide dehydrogenase” retrieves <a href="https://www.uniprot.org/uniprot/?query=lipoamide+dehydrogenase&sort=score">thousands of results</a>.</p>
</div>
</div>
</div>
<div id="what-are-accession-numbers-and-how-can-we-use-them" class="section level1" number="3">
<h1><span class="header-section-number">3</span> What are accession numbers and how can we use them?</h1>
<p>Records deposited into a database will be assigned a unique identifier, called an <em>accession number</em>. An accession number will <em>always</em> refer to the same record.</p>
<p>The format of accession numbers varies between databases:</p>
<ul>
<li>Genbank accession: <a href="https://www.ncbi.nlm.nih.gov/protein/BAD70110">BAD70110.1</a></li>
<li>UniProt accession: <a href="https://www.uniprot.org/uniprot/Q5SLK6">Q5SLK6</a></li>
<li>PDB accession: <a href="https://www.rcsb.org/structure/2eq7">2EQ7</a></li>
</ul>
<div id="note">
<p>Sometimes it may be necessary to update a particular database entry when new experimental data becomes available (e.g., to correct errors or add new information about the sequence or molecule.) The previous database entry will be preserved (it is part of the permanent scientific record!), but an updated version will be published to the database, with a new version of the accession number.</p>
<p>For example, the sequence of the well-studied model organism <em>Escherichia coli</em> (MG1655) was deposited to Genbank under the accession number NC_000913.1 in June 2004 and this record can still be accessed at <a href="https://www.ncbi.nlm.nih.gov/nuccore/NC_000913.1" class="uri">https://www.ncbi.nlm.nih.gov/nuccore/NC_000913.1</a>. However, since then a few errors in the sequence have been corrected, and the annotation updated: the current record (at time of writing) has accession number NC_000913.3 and can be found at <a href="https://www.ncbi.nlm.nih.gov/nuccore/NC_000913.3" class="uri">https://www.ncbi.nlm.nih.gov/nuccore/NC_000913.3</a>.</p>
<p>Note that the old version has a warning note attached to it and a link to the current version.</p>
<p><strong>Although an accession number is sufficient to identify the record, the version number should also be cited.</strong></p>
</div>
</div>
<div id="how-do-we-cite-data-from-public-databases" class="section level1" number="4">
<h1><span class="header-section-number">4</span> How do we cite data from public databases?</h1>
<p>Give the URL to the database, the accession number(s) of the data used, and state the date of access or version number (if available).</p>
</div>
<div id="examples-of-how-to-search-public-databases-and-retrieve-information" class="section level1" number="5">
<h1><span class="header-section-number">5</span> Examples of how to search public databases and retrieve information</h1>
<p>To look at how you might use public databases in the context of a capstone project, we will take the example of a beta-lactamase (carbapenemase) called NDM-1, which makes bacteria resistant to a broad range of beta-lactam antibiotics. The spread of bacteria carrying these carbapenemase genes is a major public health concern.</p>
<details>
<summary>
Click to toggle an example of how to search the NCBI databases (example of how to search using a keyword query)
</summary>
<p>Our first task is to find the NDM-1 amino acid sequence. To do this, we will search the NCBI databases (starting from <a href="https://www.ncbi.nlm.nih.gov/" class="uri">https://www.ncbi.nlm.nih.gov/</a>).
[The search bar is at the top of the page (arrow). Note that we can choose which of the NCBI databases we would like to search (arrow) - for this example, we will keep the default (all databases), but we could equally choose to search the Protein database - since the protein sequence is what we are looking for.]</p>
<div class="figure"><span style="display:block;" id="fig:img1"></span>
<img src="images/img1.JPG" alt="An NCBI search for ndm-1" />
<p class="caption">
Figure 5.1: An NCBI search for ndm-1
</p>
</div>
Once we have searched for “ndm-1” in the databases, we see the following page of results, showing the entries which match our query in <em>all</em> the databases at NCBI. Since we want the protein sequence, we will click on the link to those results (box).
<div class="figure"><span style="display:block;" id="fig:img2"></span>
<img src="images/img2.JPG" alt="Results page for an NCBI search for ndm-1" />
<p class="caption">
Figure 5.2: Results page for an NCBI search for ndm-1
</p>
</div>
This then takes us to a page displaying all of the hits matching our “ndm-1” query in the Protein database. In this case, we want the sequence from <em>Klebsiella pneumoniae</em>, the organism where NDM-1 was first discovered.
<div class="figure"><span style="display:block;" id="fig:img3"></span>
<img src="images/img3.JPG" alt="Protein database hits from an NCBI search for ndm-1" />
<p class="caption">
Figure 5.3: Protein database hits from an NCBI search for ndm-1
</p>
</div>
Clicking on the “NDM-1 (Klebsiella pneumoniae)” link shown in the previous image, brings us to the results page. By default, this is in the GenPept format - you can use the pulldown menu (top-left box) to change the format, e.g. to a FASTA format. The results page displays information including the accession number for this protein (AQT38377), the amino acid sequence, and information about different features in the protein (e.g., metal-binding sites). Note that you can download this information if you wish, by selecting “Send to:” (top right box), and in the pop-up menu - Choose Destination: File.
<div class="figure"><span style="display:block;" id="fig:img4"></span>
<img src="images/img4.JPG" alt="Klebsiella pneumoniae NDM-1 NCBI page" />
<p class="caption">
Figure 5.4: Klebsiella pneumoniae NDM-1 NCBI page
</p>
</div>
</details>
<details>
<summary>
Click to toggle an example of how to search the Comprehensive Antibiotic Resistance Database (CARD) (example of how to search using a keyword query with autocomplete)
</summary>
<p>We might also want to find information about which other species have been found to have NDM-1, or sequence variants that have been observed in different strains. For this, we will turn to the CARD database (<a href="https://card.mcmaster.ca/" class="uri">https://card.mcmaster.ca/</a>).</p>
The search bar for the CARD database is at the top right hand corner of the page. Note that as we start typing in our “ndm-1” query, a drop-down list of autocomplete options appears (arrow). We select and search for the option we want.
<div class="figure"><span style="display:block;" id="fig:img5"></span>
<img src="images/img5.JPG" alt="A CARD search for ndm-1" />
<p class="caption">
Figure 5.5: A CARD search for ndm-1
</p>
</div>
The results page from our query: this gives us a list of resistomes with perfect matches and with sequence variants, information about the protein - including a list of publications with PubMed links - and a look at the prevalence of NDM-1 in 263 important pathogens. Note that the accession number (ARO:3000589) is different than the accession number for the <em>K. pneumoniae</em> protein we looked at in the NCBI Protein database.
<div class="figure"><span style="display:block;" id="fig:img6"></span>
<img src="images/img6.JPG" alt="ndm-1 results page from CARD search" />
<p class="caption">
Figure 5.6: ndm-1 results page from CARD search
</p>
</div>
</details>
<details>
<summary>
Click to toggle an example of how to search the Protein Data Bank (PDB) (example of how to search using a keyword query)
</summary>
<p>We also want to find the 3D structure of the NDM-1 protein. To do this, we turn to the Protein Data Bank (PDB) at <a href="https://www.rcsb.org/" class="uri">https://www.rcsb.org/</a>.</p>
The search bar is at the top right-hand corner of the screen (arrow). We enter our “ndm-1” query and hit search.
<div class="figure"><span style="display:block;" id="fig:img7"></span>
<img src="images/img7.JPG" alt="A PDB search for ndm-1" />
<p class="caption">
Figure 5.7: A PDB search for ndm-1
</p>
</div>
The results page offers an Advanced Search query builder (top arrow), and a number of options for filtering our search (Refinements - bottom arrow). We could, for example, specify which organism we were interested in, or which experimental method - e.g., X-ray diffraction or solution NMR.
<div class="figure"><span style="display:block;" id="fig:img8"></span>
<img src="images/img8.JPG" alt="PDB search results for ndm-1" />
<p class="caption">
Figure 5.8: PDB search results for ndm-1
</p>
</div>
The results page shows a ribbon diagram of the protein and allows us to download the structural data, which we can analyse using software on our computers. The results page also links to the associated paper where this structure was first published, where you can learn more about how these data were generated.
<div class="figure"><span style="display:block;" id="fig:img9"></span>
<img src="images/img9.JPG" alt="PDB page for NDM-1" />
<p class="caption">
Figure 5.9: PDB page for NDM-1
</p>
</div>
</details>
<details>
<summary>
Click to toggle an example of how to search the Pfam database (example of how to search a database using a sequence query)
</summary>
<p>Having learned about the sequence and structure of our NDM-1 protein, we now want to learn about any related proteins. To do this, we will turn to the Pfam database (a database of protein families), at <a href="http://pfam.xfam.org/" class="uri">http://pfam.xfam.org/</a>.</p>
There are a number of different ways to search the Pfam database (centre of the page), including by keyword, accession number, or protein sequence.
<div class="figure"><span style="display:block;" id="fig:img10"></span>
<img src="images/img10.JPG" alt="The Pfam database main page" />
<p class="caption">
Figure 5.10: The Pfam database main page
</p>
</div>
In our case, we already have the protein sequence (from NCBI), so we will use that to search Pfam. Clicking on “sequence search” brings up a query box where we can paste the sequence of NDM-1 from <em>K. pneumoniae</em>.
<div class="figure"><span style="display:block;" id="fig:img11"></span>
<img src="images/img11.JPG" alt="A Pfam sequence search using the NDM-1 amino acid sequence" />
<p class="caption">
Figure 5.11: A Pfam sequence search using the NDM-1 amino acid sequence
</p>
</div>
The results page will bring up all matching protein families. In this case, there is only one match - the Lactamase B family. Note that the search results will also tell you which parts of your query match the Pfam family, which can be important for multidomain proteins.
<div class="figure"><span style="display:block;" id="fig:img12"></span>
<img src="images/img12.JPG" alt="Pfam sequence search results" />
<p class="caption">
Figure 5.12: Pfam sequence search results
</p>
</div>
Clicking on the Lactamase B family brings us to the Pfam page about this protein family (accession PF00753). By default, you land on the “Summary” part of the page, but you can access more information by clicking on the headings (e.g. Domain organization, Clan, Alignments, etc.) in the menu on the left. There is also a menu in pale blue across the top, with links to the protein architectures, sequences, interactions, species, and structures.
<div class="figure"><span style="display:block;" id="fig:img13"></span>
<img src="images/img13.JPG" alt="Pfam page for the Lactamase B family" />
<p class="caption">
Figure 5.13: Pfam page for the Lactamase B family
</p>
</div>
</details>
<details>
<summary>
Click to toggle an example of how to search the UniProt database (example of advanced search using Boolean terms)
</summary>
<p>We also want to find out what is known about NDM-1 function, i.e. the catalytic activity of the protein. For this, we will turn to the UniProt database at <a href="https://www.uniprot.org/" class="uri">https://www.uniprot.org/</a>.</p>
We enter our keyword query “ndm-1” into the search bar at the top of the page. However, we specifically want to find more information about NDM-1 from <em>K. pneumoniae</em>, so we next click “Advanced” (instead of “Search”).
<div class="figure"><span style="display:block;" id="fig:img14"></span>
<img src="images/img14.JPG" alt="A UniProt search for ndm-1" />
<p class="caption">
Figure 5.14: A UniProt search for ndm-1
</p>
</div>
We can add additional search terms to our query by clicking on the + icon. In this case, we want to restrict our search to <em>K. pneumoniae</em>, so we use Organism (arrow) and type in “Klebsiella”. A drop-down list appears highlighting potential choices and in our case, we select “Klebsiella pneumoniae”.
<div class="figure"><span style="display:block;" id="fig:img15"></span>
<img src="images/img15.JPG" alt="Building a UniProt Advanced Search" />
<p class="caption">
Figure 5.15: Building a UniProt Advanced Search
</p>
</div>
The results page brings up a number of proteins from <em>Klebsiella pneumoniae</em> that match our search. Note that there are additional options for filtering the search in the left-hand menu bar. For example, we might want to filter by “Reviewed” versus “Unreviewed” records. However, the NDM-1 protein that we are looking for is the top hit in the list.
<div class="figure"><span style="display:block;" id="fig:img16"></span>
<img src="images/img16.JPG" alt="UniProt search results for ndm-1 proteins from Klebsiella pneumoniae" />
<p class="caption">
Figure 5.16: UniProt search results for ndm-1 proteins from Klebsiella pneumoniae
</p>
</div>
The page for our protein, NDM-1 from <em>K. pneumoniae</em>. This page has information about the catalytic activity, cofactor, regulation, kinetics, localization, etc. etc. There are also links to a number of other databases and information about similar proteins.
<div class="figure"><span style="display:block;" id="fig:img17"></span>
<img src="images/img17.JPG" alt="UniProt page for the K. pneumoniae NDM-1 protein" />
<p class="caption">
Figure 5.17: UniProt page for the K. pneumoniae NDM-1 protein
</p>
</div>
</details>
</div>
</div>
</div>
</div>
<script>
/**
 * Adds the Bootstrap classes `table table-condensed` to pandoc tables.
 * Tables are located by walking up from every `tr.odd` row through its
 * `tbody` parent to the enclosing `table` element.
 */
function bootstrapStylePandocTables() {
  var oddRows = $('tr.odd');
  var tableBodies = oddRows.parent('tbody');
  var tables = tableBodies.parent('table');
  tables.addClass('table table-condensed');
}

// Apply the table styling once the DOM is ready.
$(function () {
  bootstrapStylePandocTables();
});
</script>
<!-- tabsets -->
<script>
// Once the DOM is ready, build the tabsets, passing "TOC" to the
// globally provided window.buildTabsets.
$(function () {
  window.buildTabsets("TOC");
});

// Once the DOM is ready, make dropdown-style tabsets toggle open/closed:
// clicking any tab item flips `nav-tabs-open` on its parent tab list.
$(function () {
  var dropdownTabs = $('.tabset-dropdown > .nav-tabs > li');
  dropdownTabs.on('click', function () {
    var tabList = $(this).parent();
    tabList.toggleClass('nav-tabs-open');
  });
});
</script>
<!-- table of contents (tocify) -->
<script>
// Build the table of contents in #TOC with the Tocify jQuery plugin once
// the DOM is ready.
$(document).ready(function () {
  // temporarily add toc-ignore selector to headers for the consistency with Pandoc
  $('.unlisted.unnumbered').addClass('toc-ignore');

  // move toc-ignore selectors from section div to header
  $('div.section.toc-ignore')
    .removeClass('toc-ignore')
    .children('h1,h2,h3,h4,h5').addClass('toc-ignore');

  // Tocify options: index h1-h3 headings found inside .toc-content and
  // skip anything carrying the toc-ignore class.
  var options = {
    selectors: "h1,h2,h3",
    theme: "bootstrap3",
    context: '.toc-content',
    // Derive the URL hash from the heading text: drop the characters
    // . \ / ? & ! # < > and replace every whitespace character with "_".
    hashGenerator: function (text) {
      return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_');
    },
    ignoreSelector: ".toc-ignore",
    scrollTo: 0,
    showAndHide: true,
    smoothScroll: true
  };

  // Initialise the widget. The widget instance is not needed afterwards,
  // so it is not looked up or stored.
  $("#TOC").tocify(options);
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
// Inject the MathJax loader at runtime by appending a <script> element
// to the document's first <head> element.
(function () {
  var mathjaxTag = document.createElement("script");
  mathjaxTag.type = "text/javascript";
  mathjaxTag.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";

  var headElement = document.getElementsByTagName("head")[0];
  headElement.appendChild(mathjaxTag);
})();
</script>
</body>
</html>