diff --git a/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy.py b/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy.py index cc846f10..bc42772b 100644 --- a/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy.py +++ b/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy.py @@ -107,7 +107,7 @@ def get_merged_variants(self, variants, key=None): splits = IntervalTree() for v in variants: - non_variant_interval = non_variant_tree.search(v.start, v.end) + non_variant_interval = non_variant_tree.overlap(v.start, v.end) if non_variant_interval: try: non_variant = next(iter(non_variant_interval)).data @@ -167,7 +167,7 @@ def _update_splits(self, splits, v): start = v.start end = v.end # Increase split size to also combine adjacent splits - overlapping_splits = splits.search(v.start - 1, v.end + 1) + overlapping_splits = splits.overlap(v.start - 1, v.end + 1) if overlapping_splits: for split in overlapping_splits: splits.remove(split) @@ -177,7 +177,7 @@ def _update_splits(self, splits, v): def _split_non_variants(self, non_variant_tree, splits): for split in splits: - overlapping_non_variants = non_variant_tree.search(split.begin, split.end) + overlapping_non_variants = non_variant_tree.overlap(split.begin, split.end) for onv_interval in overlapping_non_variants: non_variant_tree.remove(onv_interval) onv = onv_interval.data diff --git a/gcp_variant_transforms/libs/variant_sharding.py b/gcp_variant_transforms/libs/variant_sharding.py index 95731644..eb10e88c 100644 --- a/gcp_variant_transforms/libs/variant_sharding.py +++ b/gcp_variant_transforms/libs/variant_sharding.py @@ -84,7 +84,7 @@ def get_shard_index(self, pos=0): If no interval is found returns _UNDEFINED_SHARD_INDEX. """ - matched_regions = self._interval_tree.search(pos) + matched_regions = self._interval_tree.at(pos) # Ensure at most one region is matching to the give position. assert len(matched_regions) <= 1 if len(matched_regions) == 1: diff --git a/gcp_variant_transforms/transforms/annotate_files.py b/gcp_variant_transforms/transforms/annotate_files.py index cf489c19..653433a6 100644 --- a/gcp_variant_transforms/transforms/annotate_files.py +++ b/gcp_variant_transforms/transforms/annotate_files.py @@ -52,7 +52,7 @@ def process(self, input_pattern): t = threading.Thread(target=self._annotate_files, args=(input_pattern, watchdog_file,)) t.start() - while t.isAlive(): + while t.is_alive(): with filesystems.FileSystems.create(watchdog_file) as file_to_write: file_to_write.write(b'Watchdog file.') time.sleep(_WATCHDOG_FILE_UPDATE_INTERVAL_SECONDS) diff --git a/playground.ipynb b/playground.ipynb new file mode 100644 index 00000000..126e3526 --- /dev/null +++ b/playground.ipynb @@ -0,0 +1,482 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true, + "authorship_tag": "ABX9TyPUQL1tg5nb/KxJ3+o82kD7", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Set up" + ], + "metadata": { + "id": "U9frHX8SzHLH" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Clone the repo" + ], + "metadata": { + "id": "xtaoM88dzOta" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMw9tCD7w9ms" + }, + "outputs": [], + "source": [ + "%cd ~/\n", + "!rm -rf gcp-variant-transforms\n", + "!git clone https://github.com/aihgii/gcp-variant-transforms.git\n", + "%cd gcp-variant-transforms" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Install requirements" + ], + "metadata": { + "id": "KDtReD90Oa3M" + } + }, + { + "cell_type": "code", + "source": [ + "!pip3 install -r requirements.txt" + ], + "metadata": { + "id": "WltDnVdZCJ7F" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Import dependencies" + ], + "metadata": { + "id": "SD9dMUb_zVKW" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from string import Template" + ], + "metadata": { + "id": "GE3rP96fzaBI" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Set up credentials and environment variables" + ], + "metadata": { + "id": "lX3vWv6uzeXr" + } + }, + { + "cell_type": "code", + "source": [ + "!gcloud auth login" + ], + "metadata": { + "id": "a5yAo3BP7-lt" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!gcloud auth application-default login" + ], + "metadata": { + "id": "VRtFErc_yPcs" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "CREDENTIALS = \"/content/.config/application_default_credentials.json\" # @param {type:\"string\"}\n", + "PROJECT = \"\" # @param {type:\"string\"}\n", + "REGION = \"us-west1\" # @param {type:\"string\"}\n", + "ZONE = \"us-west1-b\" # @param {type:\"string\"}\n", + "BUCKET = \"\" # @param {type:\"string\"}\n", + "DATASET = \"genomics\" # @param {type:\"string\"}\n", + "TABLE = \"variants\" # @param {type:\"string\"}\n", + "\n", + "os.environ.update({\n", + " \"GOOGLE_APPLICATION_CREDENTIALS\": CREDENTIALS,\n", + " \"GOOGLE_CLOUD_PROJECT\": PROJECT,\n", + " \"GOOGLE_CLOUD_REGION\": REGION,\n", + " \"GOOGLE_CLOUD_ZONE\": ZONE,\n", + " \"GCS_BUCKET\": BUCKET,\n", + " \"BQ_DATASET\": DATASET,\n", + " \"BQ_TABLE\": TABLE\n", + "})" + ], + "metadata": { + "id": "l6UGJTHfy8lg", + "cellView": "form" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!gcloud config set project $GOOGLE_CLOUD_PROJECT" + ], + "metadata": { + "id": "tZu2Ad3S07fr" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Set up infrastructure" + ], + "metadata": { + "id": "GXi3tUJx1DF7" + } + }, + { + "cell_type": "code", + "source": [ + "!wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg\n", + "!echo \"deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main\" | sudo tee /etc/apt/sources.list.d/hashicorp.list\n", + "!sudo apt update && sudo apt install terraform" + ], + "metadata": { + "id": "WIrq5mJ61FwW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%%writefile main.tf\n", + "\n", + "terraform {\n", + " required_providers {\n", + " google = {\n", + " source = \"hashicorp/google\"\n", + " version = \"4.51.0\"\n", + " }\n", + " }\n", + "}\n", + "\n", + "provider \"google\" {\n", + " credentials = \"${GOOGLE_APPLICATION_CREDENTIALS}\"\n", + " project = \"${GOOGLE_CLOUD_PROJECT}\"\n", + " region = \"${GOOGLE_CLOUD_REGION}\"\n", + " zone = \"${GOOGLE_CLOUD_ZONE}\"\n", + "}\n", + "\n", + "module \"project-services\" {\n", + " source = \"terraform-google-modules/project-factory/google//modules/project_services\"\n", + " version = \"~> 14.4\"\n", + "\n", + " project_id = \"${GOOGLE_CLOUD_PROJECT}\"\n", + "\n", + " activate_apis = [\n", + " \"bigquery.googleapis.com\",\n", + " \"compute.googleapis.com\",\n", + " \"dataflow.googleapis.com\",\n", + " \"lifesciences.googleapis.com\",\n", + " \"storage-component.googleapis.com\"\n", + " ]\n", + "}\n", + "\n", + "resource \"google_storage_bucket\" \"${GCS_BUCKET}\" {\n", + " name = \"${GCS_BUCKET}\"\n", + " location = \"US\"\n", + " force_destroy = true\n", + "\n", + " public_access_prevention = \"enforced\"\n", + "}\n", + "\n", + "resource \"google_bigquery_dataset\" \"genomics\" {\n", + " dataset_id = \"${BQ_DATASET}\"\n", + "}" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vChvIrBl1Owx", + "outputId": "dd43d4c8-6300-486c-a0ed-98b09d099544" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Writing main.tf\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "with open('main.tf', 'r') as f:\n", + " tmp = Template(f.read()).substitute(os.environ)\n", + "with open('main.tf', 'w') as f:\n", + " f.write(tmp)" + ], + "metadata": { + "id": "eAVVygam19Rg" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!terraform init" + ], + "metadata": { + "id": "VY1TWfWD2ECU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!terraform plan" + ], + "metadata": { + "id": "jvgyWrUy2aPA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!terraform apply" + ], + "metadata": { + "id": "n34vpOGg2dwX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Playground" + ], + "metadata": { + "id": "tnjowMg46xvL" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Copying test data" + ], + "metadata": { + "id": "grMDC7Cf7f10" + } + }, + { + "cell_type": "code", + "source": [ + "!gsutil cp \\\n", + " gs://genomics-public-data/platinum-genomes/vcf/NA1287*_S1.genome.vcf \\\n", + " gs://$GCS_BUCKET/platinum-genomes/vcf/" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nOSb26eJ7iqT", + "outputId": "287d6f35-df70-428d-f6c4-b47875ffb865" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Copying gs://genomics-public-data/platinum-genomes/vcf/NA12877_S1.genome.vcf [Content-Type=text/x-vcard]...\n", + "Copying gs://genomics-public-data/platinum-genomes/vcf/NA12878_S1.genome.vcf [Content-Type=text/x-vcard]...\n", + "/ [2 files][ 10.2 GiB/ 10.2 GiB] \n", + "Operation completed over 2 objects/10.2 GiB. \n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Running VCF files to BigQuery Preprocessor" + ], + "metadata": { + "id": "N5HH4Uvi7K9e" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Direct runner" + ], + "metadata": { + "id": "mzmCqXvM0JRg" + } + }, + { + "cell_type": "code", + "source": [ + "!python -m gcp_variant_transforms.vcf_to_bq_preprocess \\\n", + " --input_pattern gs://$GCS_BUCKET/platinum-genomes/vcf/*.vcf \\\n", + " --report_path gs://$GCS_BUCKET/report.tsv \\\n", + " --job_name vcf-to-bigquery-preprocess-direct-runner \\\n", + " --resolved_headers_path gs://$GCS_BUCKET/resolved_headers.vcf \\\n", + " --temp_location gs://$GCS_BUCKET/temp" + ], + "metadata": { + "id": "wTiBcm8Xzpx_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Dataflow runner" + ], + "metadata": { + "id": "I34wEO2K0S9W" + } + }, + { + "cell_type": "code", + "source": [ + "!python -m gcp_variant_transforms.vcf_to_bq_preprocess \\\n", + " --input_pattern gs://$GCS_BUCKET/platinum-genomes/vcf/*.vcf \\\n", + " --report_path gs://$GCS_BUCKET/report.tsv \\\n", + " --job_name vcf-to-bigquery-preprocess \\\n", + " --resolved_headers_path gs://$GCS_BUCKET/resolved_headers.vcf \\\n", + " --report_all_conflicts true \\\n", + " --setup_file ./setup.py \\\n", + " --runner DataflowRunner \\\n", + " --project $GOOGLE_CLOUD_PROJECT \\\n", + " --region $GOOGLE_CLOUD_REGION \\\n", + " --temp_location gs://$GCS_BUCKET/temp \\\n", + " --requirements_file requirements.txt" + ], + "metadata": { + "id": "2OmSwYtv60ra" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Running VCF files to BigQuery transformation" + ], + "metadata": { + "id": "hXXeD0Pg0bmi" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Direct runner" + ], + "metadata": { + "id": "ev2zQwUJC6G7" + } + }, + { + "cell_type": "code", + "source": [ + "!python -m gcp_variant_transforms.vcf_to_bq \\\n", + " --input_pattern gs://$GCS_BUCKET/platinum-genomes/vcf/*.vcf \\\n", + " --output_table $GOOGLE_CLOUD_PROJECT:$BQ_DATASET.$BQ_TABLE \\\n", + " --job_name vcf-to-bigquery-direct-runner \\\n", + " --temp_location gs://$GCS_BUCKET/temp" + ], + "metadata": { + "id": "il0T5ebVDEpc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Dataflow runner" + ], + "metadata": { + "id": "z4ftIP5ICuuI" + } + }, + { + "cell_type": "code", + "source": [ + "!python -m gcp_variant_transforms.vcf_to_bq \\\n", + " --input_pattern gs://$GCS_BUCKET/platinum-genomes/vcf/*.vcf \\\n", + " --output_table $GOOGLE_CLOUD_PROJECT:$BQ_DATASET.$BQ_TABLE \\\n", + " --job_name vcf-to-bigquery \\\n", + " --setup_file ./setup.py \\\n", + " --runner DataflowRunner \\\n", + " --project $GOOGLE_CLOUD_PROJECT \\\n", + " --region $GOOGLE_CLOUD_REGION \\\n", + " --temp_location gs://$GCS_BUCKET/temp \\\n", + " --requirements_file requirements.txt" + ], + "metadata": { + "id": "PfkeTq6ripHG" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 97a30c4f..8d016c50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ -pysam<0.16.0 -cython>=0.28.1 -setuptools -#warning on apache-beam: https://cloud.google.com/dataflow/docs/guides/common-errors -apache-beam[gcp]==2.37.0 -avro-python3==1.10.2 +wheel==0.42.0 +Cython==3.0.8 +setuptools==69.0.3 +pysam==0.22.0 +apache-beam[gcp]==2.53.0 +avro==1.11.3 google-cloud-core -google-api-python-client>=1.6,<1.7.12 -intervaltree>=2.1.0,<2.2.0 -mmh3<2.6 -mock==4.0.3 +google-api-python-client==2.116.0 +intervaltree==3.1.0 +mmh3==4.1.0 +mock==5.1.0 google-cloud-storage pyfarmhash -pyyaml==5.4.1 -nose>=1.0 +pyyaml==6.0.1 +nose==1.3.7 \ No newline at end of file