# Looking up open datasets in AWS Marketplace using {arrow}

Using the {arrow} package to look into and use datasets in S3 storage
## Introduction
Have you ever wanted to dive into huge biological datasets, do a little citizen science, or just sharpen your coding skills? The AWS Open Data Registry is a treasure trove of public datasets you can access for free! Today, we’ll explore the Clinical Proteomic Tumor Analysis Consortium (CPTAC-2) dataset.

CPTAC-2 is part of a national effort to accelerate our understanding of cancer biology through proteogenomics. The datasets include RNA-Seq, miRNA quantification, and other valuable tools for cancer research. Exciting stuff, right? Let’s access and explore it using R and packages like {arrow} and {purrr}.
## Key Packages and Techniques
This post relies on a few key packages and techniques to unlock the data:
- {arrow}: Enables seamless access to data stored in AWS S3 buckets and efficient handling of large datasets.
- {purrr}: Makes iterating over data structures easy and expressive.
- {stringr}: Provides handy tools for string manipulation.
- {tibble}: Simplifies working with tabular data in R.
- {dplyr}: A powerful tool for data manipulation and transformation.
Here’s how we’ll use these tools:

1. Connect to an AWS S3 bucket and explore its structure.
2. List and classify the available files.
3. Load specific datasets into dataframes based on their type for analysis.
By the end, you’ll know how to apply these packages to answer key questions like:

- What datasets are available in the CPTAC-2 bucket?
- How can we identify and load the files we need?
- What insights can we draw from these datasets?
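Before diving in, a quick setup note: if you want to follow along, the sketch below (an assumption about your environment, not part of the original analysis) installs the packages and confirms that your {arrow} build has S3 support.

```r
# Install the packages used throughout this post (skip any you already have)
install.packages(c("arrow", "purrr", "stringr", "tibble", "dplyr"))

# s3_bucket() only works if {arrow} was compiled with S3 support;
# the prebuilt binaries usually include it, but it's worth checking.
arrow::arrow_with_s3()
```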
## Exploring the Data

### Connecting to AWS S3 Storage
The CPTAC data lives in an S3 bucket on AWS. To access it, we’ll use `arrow::s3_bucket()`. The {arrow} package makes working with cloud storage and large datasets seamless. Let’s set up the connection:
```r
# Connect to the CPTAC-2 open data bucket on AWS
cptac_s3 <- arrow::s3_bucket("s3://gdc-cptac-2-phs000892-2-open/")
```
Boom! We now have a pipeline into the bucket, and we can start peeking at its contents.
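One practical note: `s3_bucket()` will look for AWS credentials in your environment. Since this is a public bucket, anonymous access should work too; {arrow} forwards extra arguments to `S3FileSystem$create()`, so if the default call trips over credentials, a fallback like this should do the trick:

```r
# The bucket is public, so we can skip the credential lookup entirely
# (anonymous is forwarded to S3FileSystem$create())
cptac_s3 <- arrow::s3_bucket(
  "s3://gdc-cptac-2-phs000892-2-open/",
  anonymous = TRUE
)
```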
### Listing Folders and Files
The first step is to see what’s inside this giant bucket.
#### Listing Folders
```r
# List all folders in the bucket
cptac_folders <- cptac_s3$ls()
length(cptac_folders)
```

```
[1] 3984
```

```r
cptac_folders[1:5]
```

```
[1] "00308a6b-56f8-4e51-9b4b-500ca6d32387"
[2] "006fc0cd-8419-4278-887b-ee922340fd85"
[3] "0088029e-05f1-461c-a30a-2d6628a7f8d2"
[4] "00997ac4-8cc4-4c90-a2ab-d4a36068faf0"
[5] "00d681ae-9382-4224-a8a2-d51fd4dbaa28"
```
Here, `cptac_s3$ls()` gives us the top-level folder names in the bucket.
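Those folder names look like GDC file UUIDs. If you’re curious, a quick sanity check with {stringr} (my addition, not in the original workflow) confirms they all follow the 36-character UUID pattern:

```r
# Every folder name should be a hyphenated, lowercase-hex UUID (8-4-4-4-12)
all(stringr::str_detect(
  cptac_folders,
  "^[0-9a-f]{8}(-[0-9a-f]{4}){3}-[0-9a-f]{12}$"
))
```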
#### Listing Files in Each Folder
Now that we have folders, let’s dive into them and list the files inside. We’ll use the `purrr::map_chr()` function to map over the folders and fetch the file names.
```r
# List all files within the folders
cptac_files <- purrr::map_chr(
  cptac_folders,
  ~{ cptac_s3$ls(.x) }
)
length(cptac_files)
```

```
[1] 3984
```

```r
cptac_files[1:5]
```

```
[1] "00308a6b-56f8-4e51-9b4b-500ca6d32387/35036955-3a03-4b7a-bd55-d8bc721f99c4.htseq_counts.txt.gz"
[2] "006fc0cd-8419-4278-887b-ee922340fd85/fcc26511-0b8b-4914-ad7e-bb97a3cd03f8.FPKM-UQ.txt.gz"
[3] "0088029e-05f1-461c-a30a-2d6628a7f8d2/34566acf-5f1f-4b7c-859a-5c7c2c38ed8e.wxs.aliquot_ensemble_masked.maf.gz"
[4] "00997ac4-8cc4-4c90-a2ab-d4a36068faf0/4dc10240-c4ca-4b92-866b-af1542575fc8.rna_seq.augmented_star_gene_counts.tsv"
[5] "00d681ae-9382-4224-a8a2-d51fd4dbaa28/c2de32e3-b9bc-49e1-9dbe-8c09b48ff637.htseq_counts.txt.gz"
```
This step takes each folder and retrieves the file within. Note that `purrr::map_chr()` only works here because each folder contains exactly one file (both vectors have length 3984); a folder holding several files would cause an error.
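If that one-file-per-folder assumption makes you nervous, here’s a more defensive sketch (not from the original post) that tolerates folders containing any number of files:

```r
# map() returns one listing per folder regardless of length;
# list_c() then flattens the listings into a single character vector.
cptac_files_safe <- cptac_folders |>
  purrr::map(~ cptac_s3$ls(.x)) |>
  purrr::list_c()

length(cptac_files_safe)  # 3984 again, confirming one file per folder
```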
### Classifying File Types
Let’s get curious: what types of data are in these files? We’ll extract file extensions to see what formats are available.
```r
# Extract file types by splitting filenames
cptac_filetypes <- purrr::map_chr(
  cptac_files,
  ~{
    (basename(.x) |>
      stringr::str_split_fixed(pattern = "\\.", n = 2))[2]
  }
)

# Count the occurrences of each file type
table(cptac_filetypes)
```

```
cptac_filetypes
                      FPKM-UQ.txt.gz                             FPKM.txt.gz
                                 340                                     340
                 htseq_counts.txt.gz    mirnaseq.isoforms.quantification.txt
                                 340                                     650
  mirnaseq.mirnas.quantification.txt  rna_seq.augmented_star_gene_counts.tsv
                                 650                                     340
     rna_seq.star_gene_counts.tsv.gz      wxs.aliquot_ensemble_masked.maf.gz
                                 340                                     984
```
Here, we:

1. Use `basename()` to get the file name without the folder path.
2. Split the file name at the first dot (`"\\."`) to extract extensions.
3. Count the file types using `table()`.
This gives us a nice summary of the types of files: .txt.gz, .tsv, and so on.
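As a side note, the same tally can be written as a single pipeline, which is handy if you’d rather have the counts as a tibble than a `table` object. A sketch using `stringr::str_remove()` in place of the split-and-index approach:

```r
# Strip everything up to and including the first dot, then count the suffixes
tibble::tibble(file = cptac_files) |>
  dplyr::mutate(filetype = stringr::str_remove(basename(file), "^[^.]*\\.")) |>
  dplyr::count(filetype, sort = TRUE)
```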
### Loading Specific Datasets
Now for the fun part: let’s load and explore some actual data. We’ll define rules to read specific file types, like gene expression files and miRNA quantification data. Since there are too many files to structure all at once, we’ll just go through the first 20. Here’s the magic:
```r
# Read in data based on file type
cptac_s3_df_20 <- tibble::tibble(
  cptac_folders,
  cptac_files,
  cptac_filetypes
) |>
  head(20) |>
  dplyr::mutate(
    df = list(NULL),
    df = purrr::map2(cptac_files, cptac_filetypes, ~{
      if (.y %in% c("htseq_counts.txt.gz", "FPKM-UQ.txt.gz", "FPKM.txt.gz")) {
        cptac_s3$path(.x) |>
          arrow::read_delim_arrow(delim = "\t", col_names = FALSE) |>
          tibble::as_tibble() |>
          dplyr::rename(ensembl_id = f0, value = f1)
      } else if (.y %in% c("rna_seq.augmented_star_gene_counts.tsv")) {
        cptac_s3$path(.x) |>
          arrow::read_delim_arrow(delim = "\t", skip = 1) |>
          tibble::as_tibble()
      } else if (.y %in% c("mirnaseq.mirnas.quantification.txt",
                           "mirnaseq.isoforms.quantification.txt")) {
        cptac_s3$path(.x) |>
          arrow::read_delim_arrow(delim = "\t") |>
          tibble::as_tibble()
      } else if (.y %in% c("wxs.aliquot_ensemble_masked.maf.gz")) {
        cptac_s3$path(.x) |>
          arrow::read_delim_arrow(delim = "\t", skip = 7) |>
          tibble::as_tibble()
      }
    })
  )

cptac_s3_df_20
```
```
# A tibble: 20 × 4
   cptac_folders                        cptac_files     cptac_filetypes df      
   <chr>                                <chr>           <chr>           <list>  
 1 00308a6b-56f8-4e51-9b4b-500ca6d32387 00308a6b-56f8-… htseq_counts.t… <tibble>
 2 006fc0cd-8419-4278-887b-ee922340fd85 006fc0cd-8419-… FPKM-UQ.txt.gz  <tibble>
 3 0088029e-05f1-461c-a30a-2d6628a7f8d2 0088029e-05f1-… wxs.aliquot_en… <tibble>
 4 00997ac4-8cc4-4c90-a2ab-d4a36068faf0 00997ac4-8cc4-… rna_seq.augmen… <tibble>
 5 00d681ae-9382-4224-a8a2-d51fd4dbaa28 00d681ae-9382-… htseq_counts.t… <tibble>
 6 00fdd1b4-a87d-4db4-90d3-4c73ff1c9c3e 00fdd1b4-a87d-… wxs.aliquot_en… <tibble>
 7 012a0d15-2d7d-4bab-af3f-ba3d808331c1 012a0d15-2d7d-… mirnaseq.isofo… <tibble>
 8 01302114-2e59-4695-8ec5-93883b9e8f44 01302114-2e59-… FPKM-UQ.txt.gz  <tibble>
 9 013e52b9-1a81-41ef-a287-71bfcfb5c5eb 013e52b9-1a81-… mirnaseq.mirna… <tibble>
10 0144f8ae-ebd9-421f-abfd-83da589a8660 0144f8ae-ebd9-… FPKM.txt.gz     <tibble>
11 0150be78-91fe-4628-bed9-f79c74aa31d1 0150be78-91fe-… htseq_counts.t… <tibble>
12 015c8c0a-421f-4a58-86fb-664575786002 015c8c0a-421f-… FPKM.txt.gz     <tibble>
13 016a2336-115d-4542-82ce-f5b4d5e08805 016a2336-115d-… rna_seq.augmen… <tibble>
14 017d77e5-c9d9-4888-a0c5-2b27a26910c3 017d77e5-c9d9-… FPKM-UQ.txt.gz  <tibble>
15 01886ea8-8c31-4e41-b136-731fd79e775e 01886ea8-8c31-… mirnaseq.isofo… <tibble>
16 018f0332-4225-48f2-8ba9-6f2dad922532 018f0332-4225-… mirnaseq.isofo… <tibble>
17 01947104-063e-4bf5-ab9e-b9797ccdfcee 01947104-063e-… mirnaseq.mirna… <tibble>
18 01c70055-4080-49c4-8980-9e152f8b9635 01c70055-4080-… FPKM.txt.gz     <tibble>
19 01c7a24a-ca38-4961-87bf-a91734df83cd 01c7a24a-ca38-… FPKM-UQ.txt.gz  <tibble>
20 01cae191-097d-4df3-b599-f3709c4a95f1 01cae191-097d-… mirnaseq.mirna… <tibble>
```
Here’s what happens:

- File types: Depending on the file type (RNA-seq, miRNA, etc.), we apply different parsing rules.
- `arrow::read_delim_arrow()`: Reads data efficiently straight from the bucket.
- Skipping headers: Some files have a few metadata rows that need to be skipped before the actual data begins.
We now have actual dataframes loaded into the `df` column for exploration.
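From here you can start digging into the nested tables themselves. As one example (a sketch built on the objects above), here’s how you might pull out and preview the first gene-count tibble:

```r
# Grab the first htseq_counts table from the nested df column and preview it
cptac_s3_df_20 |>
  dplyr::filter(cptac_filetypes == "htseq_counts.txt.gz") |>
  dplyr::slice(1) |>
  dplyr::pull(df) |>
  purrr::pluck(1) |>
  head()
```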
## Wrapping Up
In this post, we:

1. Connected to an AWS S3 bucket to access the CPTAC-2 dataset.
2. Explored the folder and file structure.
3. Classified file types to understand the available data.
4. Loaded specific datasets into R for analysis.
The CPTAC-2 dataset is a treasure trove for exploring cancer biology, and tools like {arrow} and {purrr} make accessing and analyzing such massive datasets both approachable and fun. The possibilities are endless, whether for academic research, personal projects, or learning new coding techniques.
So, go on and explore some datasets!