diff --git a/.gitignore b/.gitignore index 87a7c79d8..0287700f8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *~ *.pyc +*.py.bak *.egg-info/ *\.egg/ .eggs/ @@ -7,6 +8,7 @@ build/ dist/ .idea/ venv/ +venv3/ .vscode/ .coverage *.DS_Store diff --git a/.travis.yml b/.travis.yml index 2e6afd640..e4414d188 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,8 @@ language: python python: - - "2.7" + - "3.7" install: + - python -m pip install wheel - python -m pip install --upgrade . - python -m pip install pylint - python -m pip install python-coveralls diff --git a/README.md b/README.md index df92a8a08..6386f46f0 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,8 @@ +# This branch is under development and is not in working condition. +## Please rely solely on the master branch to run the tool. + +# ------------------------------------------------ + # GCP Variant Transforms [![Build Status](https://travis-ci.org/googlegenomics/gcp-variant-transforms.svg?branch=master)](https://travis-ci.org/googlegenomics/gcp-variant-transforms) @@ -111,18 +116,20 @@ In addition to using the docker image, you may run the pipeline directly from source. First install git, python, pip, and virtualenv: ```bash -sudo apt-get install -y git python-pip python-dev build-essential -sudo python -m pip install --upgrade pip -sudo python -m pip install --upgrade virtualenv +sudo apt-get install -y git python3-pip python3-venv python3.7-venv python-dev build-essential ``` +Note that Python 3.8 is not yet supported, so ensure you are using Python 3.7. + Run virtualenv, clone the repo, and install pip packages: ```bash -virtualenv venv -source venv/bin/activate +python3 -m venv venv3 +source venv3/bin/activate git clone https://github.com/googlegenomics/gcp-variant-transforms.git cd gcp-variant-transforms +python -m pip install --upgrade pip +python -m pip install --upgrade wheel python -m pip install --upgrade . ``` diff --git a/deploy_and_run_tests.sh b/deploy_and_run_tests.sh index f8efb165e..28a2d405e 100755 --- a/deploy_and_run_tests.sh +++ b/deploy_and_run_tests.sh @@ -188,9 +188,7 @@ parse_args() { # Deactivates virtualenv, removes its directory, and deletes the image. ################################################# clean_up() { - color_print "Removing integration test environment ${temp_dir}" "${GREEN}" deactivate - rm -rf "${temp_dir}" if [[ -z "${keep_image}" ]]; then # TODO(bashir2): Find a way to mark these images as temporary such that they are # garbage collected automatically if the test fails before this line. @@ -226,22 +224,11 @@ if [[ -z "${skip_build}" ]]; then --substitutions _CUSTOM_TAG_NAME="${image_tag}" . fi -# Running integration tests in a temporary virtualenv -temp_dir="$(mktemp -d)" -color_print "Setting up integration test environment in ${temp_dir}" "${GREEN}" -# Since we have no prompt we need to disable prompt changing in virtualenv. -export VIRTUAL_ENV_DISABLE_PROMPT="something" -virtualenv "${temp_dir}" -source ${temp_dir}/bin/activate; +source /opt/gcp_variant_transforms/venv3/bin/activate; trap clean_up EXIT if [[ -n "${run_unit_tests}" ]]; then - python -m pip install --upgrade . python setup.py test fi -python -m pip install --upgrade .[int_test] - -# Force an upgrade to avoid SSL certificate verification errors (issue #453).
-python -m pip install --upgrade httplib2 color_print "Running integration tests against ${full_image_name}" "${GREEN}" python gcp_variant_transforms/testing/integration/run_vcf_to_bq_tests.py \ diff --git a/docker/Dockerfile b/docker/Dockerfile index 455d5ddfd..50890494c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -32,7 +32,7 @@ ADD / /opt/gcp_variant_transforms/src/ # Needed for installing mmh3 (one of the required packages in setup.py). RUN apt install -y g++ -# Install Pysam dependencies. These dependencies are only required becuase we +# Install Pysam dependencies. These dependencies are only required because we # have a monolithic binary - they primarily have to be installed on the workers. RUN apt-get update && apt-get install -y \ autoconf \ @@ -45,30 +45,31 @@ RUN apt-get update && apt-get install -y \ make \ perl \ zlib1g-dev \ - python-pip + python3-pip \ + python3-venv # Install dependencies. -RUN python -m pip install --upgrade pip && \ - python -m pip install --upgrade virtualenv && \ - virtualenv /opt/gcp_variant_transforms/venv && \ - . /opt/gcp_variant_transforms/venv/bin/activate && \ +RUN python3 -m venv /opt/gcp_variant_transforms/venv3 && \ + . /opt/gcp_variant_transforms/venv3/bin/activate && \ cd /opt/gcp_variant_transforms/src && \ - python -m pip install --upgrade . + python3 -m pip install --upgrade pip && \ + python3 -m pip install --upgrade wheel && \ + python3 -m pip install --upgrade . RUN printf '#!/bin/bash\n%s\n%s' \ - ". /opt/gcp_variant_transforms/venv/bin/activate && cd /opt/gcp_variant_transforms/src" \ + ". /opt/gcp_variant_transforms/venv3/bin/activate && cd /opt/gcp_variant_transforms/src" \ 'python -m gcp_variant_transforms.vcf_to_bq --setup_file ./setup.py "$@"' > \ /opt/gcp_variant_transforms/bin/vcf_to_bq && \ chmod +x /opt/gcp_variant_transforms/bin/vcf_to_bq RUN printf '#!/bin/bash\n%s\n%s' \ - ". /opt/gcp_variant_transforms/venv/bin/activate && cd /opt/gcp_variant_transforms/src" \ + ". /opt/gcp_variant_transforms/venv3/bin/activate && cd /opt/gcp_variant_transforms/src" \ 'python -m gcp_variant_transforms.vcf_to_bq_preprocess --setup_file ./setup.py "$@"' > \ /opt/gcp_variant_transforms/bin/vcf_to_bq_preprocess && \ chmod +x /opt/gcp_variant_transforms/bin/vcf_to_bq_preprocess RUN printf '#!/bin/bash\n%s\n%s' \ - ". /opt/gcp_variant_transforms/venv/bin/activate && cd /opt/gcp_variant_transforms/src" \ + ". /opt/gcp_variant_transforms/venv3/bin/activate && cd /opt/gcp_variant_transforms/src" \ 'python -m gcp_variant_transforms.bq_to_vcf --setup_file ./setup.py "$@"' > \ /opt/gcp_variant_transforms/bin/bq_to_vcf && \ chmod +x /opt/gcp_variant_transforms/bin/bq_to_vcf diff --git a/docs/bigquery_to_vcf.md b/docs/bigquery_to_vcf.md index 95005e6c0..35f61fde8 100644 --- a/docs/bigquery_to_vcf.md +++ b/docs/bigquery_to_vcf.md @@ -111,7 +111,7 @@ source. 
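Reviewer note: the CI, Dockerfile, and docs above all converge on Python 3, with 3.7 as the supported version (per the README note that 3.8 is not yet supported by Apache Beam). As an illustration only, and not part of this patch, an entry point could fail fast on an unsupported interpreter:

```python
import sys

# Hypothetical fail-fast guard (not in this patch): Apache Beam supported
# Python 3.7 but not 3.8 at this point, so refuse other interpreters early.
if sys.version_info[:2] != (3, 7):
    raise RuntimeError(
        'Python 3.7 is required; found {}.{}.'.format(
            sys.version_info.major, sys.version_info.minor))
```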
Example command for DirectRunner: ```bash -python -m gcp_variant_transforms.bq_to_vcf \ +python3 -m gcp_variant_transforms.bq_to_vcf \ --input_table bigquery-public-data:human_genome_variants.1000_genomes_phase_3_variants_20150220 \ --output_file gs://BUCKET/loaded_file.vcf \ --job_name bq-to-vcf-direct-runner \ @@ -124,7 +124,7 @@ python -m gcp_variant_transforms.bq_to_vcf \ Example command for DataflowRunner: ```bash -python -m gcp_variant_transforms.bq_to_vcf \ +python3 -m gcp_variant_transforms.bq_to_vcf \ --input_table bigquery-public-data:human_genome_variants.1000_genomes_phase_3_variants_20150220 \ --output_file gs://BUCKET/loaded_file.vcf \ --job_name bq-to-vcf \ diff --git a/docs/development_guide.md b/docs/development_guide.md index 33d5ecdc7..b7ee1723e 100644 --- a/docs/development_guide.md +++ b/docs/development_guide.md @@ -41,17 +41,19 @@ git remote add upstream git@github.com:googlegenomics/gcp-variant-transforms.git #### Setup virtualenv +Ensure you are using Python 3.7, since Apache Beam does not yet support Python 3.8. + ```bash -sudo apt-get install python-pip python-dev build-essential -sudo python -m pip install --upgrade pip -sudo python -m pip install --upgrade virtualenv -virtualenv venv -. venv/bin/activate +sudo apt-get install python3-pip python3-venv python3.7-venv python-dev build-essential +python3 -m venv venv3 +. venv3/bin/activate ``` #### Install dependences ```bash +python -m pip install --upgrade pip +python -m pip install --upgrade wheel python -m pip install --upgrade . ``` Note that after running the above command we get some dependency conflicts in diff --git a/docs/setting_region.md b/docs/setting_region.md index a3ba245dc..709dcf870 100644 --- a/docs/setting_region.md +++ b/docs/setting_region.md @@ -54,7 +54,7 @@ If you are running Variant Transforms from GitHub, you need to specify all three as below. ```bash -python -m gcp_variant_transforms.vcf_to_bq \ +python3 -m gcp_variant_transforms.vcf_to_bq \ ... \ --project "${GOOGLE_CLOUD_PROJECT}" \ --region europe-west1 \ diff --git a/docs/vcf_files_preprocessor.md b/docs/vcf_files_preprocessor.md index 0e88dae59..b23d43ea0 100644 --- a/docs/vcf_files_preprocessor.md +++ b/docs/vcf_files_preprocessor.md @@ -100,7 +100,7 @@ source.
Example command for DirectRunner: ```bash -python -m gcp_variant_transforms.vcf_to_bq_preprocess \ +python3 -m gcp_variant_transforms.vcf_to_bq_preprocess \ --input_pattern gcp_variant_transforms/testing/data/vcf/valid-4.0.vcf \ --report_path gs://BUCKET/report.tsv --job_name vcf-to-bigquery-preprocess-direct-runner \ @@ -112,7 +112,7 @@ python -m gcp_variant_transforms.vcf_to_bq_preprocess \ Example command for DataflowRunner: ```bash -python -m gcp_variant_transforms.vcf_to_bq_preprocess \ +python3 -m gcp_variant_transforms.vcf_to_bq_preprocess \ --input_pattern gs://BUCKET/*.vcf \ --report_path gs://BUCKET/report.tsv \ --job_name vcf-to-bigquery-preprocess \ diff --git a/gcp_variant_transforms/beam_io/bgzf.py b/gcp_variant_transforms/beam_io/bgzf.py index 68cfa6869..f949e06aa 100644 --- a/gcp_variant_transforms/beam_io/bgzf.py +++ b/gcp_variant_transforms/beam_io/bgzf.py @@ -55,7 +55,7 @@ class BGZF(filesystem.CompressedFile): def __init__(self, fileobj, compression_type=filesystem.CompressionTypes.GZIP): - super(BGZF, self).__init__(fileobj, compression_type) + super().__init__(fileobj, compression_type) self._first_fetch = True def _fetch_to_internal_buffer(self, num_bytes): @@ -119,7 +119,7 @@ def __init__(self, validate=True ): """A source for reading BGZF Block.""" - super(BGZFBlockSource, self).__init__( + super().__init__( file_name, min_bundle_size, compression_type, @@ -147,7 +147,7 @@ def read_records(self, file_name, _): record = file_to_read.readline() if not record or not record.strip(): break - if record and not record.startswith('#'): + if record and not record.startswith(b'#'): yield self._coder.decode(record) @@ -164,15 +164,15 @@ def __init__(self, fileobj, block, compression_type=filesystem.CompressionTypes.GZIP): - super(BGZFBlock, self).__init__(fileobj, - compression_type) + super().__init__(fileobj, + compression_type) self._block = block self._start_offset = self._block.start def _fetch_and_decompress_data_to_buffer(self, num_bytes): if self._first_fetch: self._read_first_gzip_block_into_buffer() - super(BGZFBlock, self)._fetch_and_decompress_data_to_buffer(num_bytes) + super()._fetch_and_decompress_data_to_buffer(num_bytes) if self._read_eof: self._complete_last_line() @@ -181,7 +181,7 @@ def _read_first_gzip_block_into_buffer(self): decompressed = self._decompressor.decompress(buf) del buf # Discards all data before first `\n`. - while '\n' not in decompressed: + while b'\n' not in decompressed: if self._decompressor.unused_data != b'': buf = self._decompressor.unused_data self._decompressor = zlib.decompressobj(self._gzip_mask) @@ -191,12 +191,12 @@ def _read_first_gzip_block_into_buffer(self): raise ValueError('Read failed. The block {} does not contain any ' 'valid record.'.format(self._block)) - lines = decompressed.split('\n') - self._read_buffer.write('\n'.join(lines[1:])) + lines = decompressed.split(b'\n') + self._read_buffer.write(b'\n'.join(lines[1:])) def _read_data_from_source(self): if self._start_offset == self._block.end: - return '' + return b'' buf = self._file.raw._downloader.get_range(self._start_offset, self._block.end) self._start_offset += len(buf) @@ -212,7 +212,7 @@ def _complete_last_line(self): if not decompressed: return # Writes all data to the buffer until the first `\n` is reached. 
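Reviewer note: the bgzf.py hunks above replace the Python 2 form `super(BGZF, self).__init__(...)` with the zero-argument `super()`, which Python 3 resolves from the enclosing class. A minimal sketch with stand-in classes (not the real `apache_beam.io.filesystem.CompressedFile`):

```python
class CompressedFile:
    """Stand-in for apache_beam.io.filesystem.CompressedFile."""
    def __init__(self, fileobj, compression_type):
        self.fileobj = fileobj
        self.compression_type = compression_type

class BGZF(CompressedFile):
    def __init__(self, fileobj, compression_type='gzip'):
        # Zero-argument super() is equivalent to super(BGZF, self) in
        # Python 2, but avoids repeating the class name.
        super().__init__(fileobj, compression_type)
        self._first_fetch = True

bgzf = BGZF(fileobj=None)
assert bgzf.compression_type == 'gzip'
```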
- while '\n' not in decompressed: + while b'\n' not in decompressed: if self._decompressor.unused_data != b'': self._read_buffer.write(decompressed) buf = self._decompressor.unused_data @@ -222,4 +222,4 @@ def _complete_last_line(self): else: raise ValueError('Read failed. The record is longer than {} ' 'bytes.'.format(self._read_size)) - self._read_buffer.write(decompressed.split('\n')[0] + '\n') + self._read_buffer.write(decompressed.split(b'\n')[0] + b'\n') diff --git a/gcp_variant_transforms/beam_io/bgzf_io.py b/gcp_variant_transforms/beam_io/bgzf_io.py index 9e9a28543..f5d62f234 100644 --- a/gcp_variant_transforms/beam_io/bgzf_io.py +++ b/gcp_variant_transforms/beam_io/bgzf_io.py @@ -116,7 +116,7 @@ def _read(index_file): if not line: break lines.append(line) - return ''.join(lines) + return b''.join(lines) def _remove_invalid_blocks(blocks): diff --git a/gcp_variant_transforms/beam_io/bgzf_test.py b/gcp_variant_transforms/beam_io/bgzf_test.py index a4c4791d7..f283de32e 100644 --- a/gcp_variant_transforms/beam_io/bgzf_test.py +++ b/gcp_variant_transforms/beam_io/bgzf_test.py @@ -27,9 +27,10 @@ class BgzfBlockTest(unittest.TestCase): def setUp(self): - with open(testdata_util.get_full_file_path('Y.vcf.bgz')) as file_to_read: + with open(testdata_util.get_full_file_path('Y.vcf.bgz'), + mode='rb') as file_to_read: data = file_to_read.readlines() - self._data = ''.join(data) + self._data = b''.join(data) self.client = gcsio_test.FakeGcsClient() self.gcs = gcsio.GcsIO(self.client) self._file_name = 'gs://bucket/test' @@ -76,12 +77,12 @@ def _read_all_lines(self, file_to_read): line = file_to_read.readline() if not line: break - lines.append(line) + lines.append(line.decode('utf-8')) return lines def _validate_first_line_is_complete(self, line): self.assertEqual( - line, + line.decode('utf-8'), '##INFO=\n') diff --git a/gcp_variant_transforms/beam_io/vcf_estimate_io.py b/gcp_variant_transforms/beam_io/vcf_estimate_io.py index cb11071ca..7c0a2ed7b 100644 --- a/gcp_variant_transforms/beam_io/vcf_estimate_io.py +++ b/gcp_variant_transforms/beam_io/vcf_estimate_io.py @@ -15,7 +15,7 @@ """A source for reading VCF files and extracting signals about input size.""" -from __future__ import absolute_import + from functools import partial from typing import Dict, Iterable # pylint: disable=unused-import @@ -28,7 +28,7 @@ from apache_beam.io import range_trackers # pylint: disable=unused-import -class VcfEstimate(object): +class VcfEstimate(): """Container for estimation data about the VCF file.""" def __init__(self, @@ -74,23 +74,23 @@ def __init__(self, compression_type=filesystem.CompressionTypes.AUTO, validate=True): # type: (str, str, bool) -> None - super(VcfEstimateSource, self).__init__(file_pattern, - compression_type=compression_type, - validate=validate, - splittable=False) + super().__init__(file_pattern, + compression_type=compression_type, + validate=validate, + splittable=False) self._compression_type = compression_type def _get_header_info(self, file_to_read, file_name): # type: (str, str) -> (int, str) """Returns the header size and sample names.""" header_size = 0 - header_line = file_to_read.readline() + header_line = file_to_read.readline().decode('utf-8') # Read and skip all header lines starting with ##. Make sure to calculate # their total size, to marginally better approximate the line count. 
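Reviewer note: the literal changes in bgzf.py, bgzf_io.py, and bgzf_test.py (`'#'` to `b'#'`, `'\n'` to `b'\n'`, `''.join` to `b''.join`) all follow from one Python 3 rule: binary-mode reads yield `bytes`, so every delimiter, prefix, and join partner must be bytes as well. A toy, self-contained sketch (hypothetical file name):

```python
# Write a tiny VCF-like file so the sketch is runnable on its own.
with open('sample.vcf', 'wb') as f:
    f.write(b'##fileformat=VCFv4.3\n#CHROM\tPOS\n1\t100\n')

# Binary-mode reads return bytes in Python 3, so comparisons and splits
# must use bytes literals too.
with open('sample.vcf', 'rb') as f:
    data = f.read()

lines = data.split(b'\n')
header_lines = [line for line in lines if line.startswith(b'#')]
# Decode only at the boundary where the data is consumed as text.
decoded = [line.decode('utf-8') for line in header_lines]
assert decoded[0] == '##fileformat=VCFv4.3'
```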
while (header_line.startswith('##') or not header_line or not header_line.strip()): header_size += len(header_line) - header_line = file_to_read.readline() + header_line = file_to_read.readline().decode('utf-8') if not header_line.startswith('#'): raise ValueError(('No column-defining header line was found in file {}.' .format(file_name))) @@ -170,7 +170,7 @@ def __init__( validate: Flag to verify that the files exist during the pipeline creation time. """ - super(GetEstimates, self).__init__(**kwargs) + super().__init__(**kwargs) self._source = VcfEstimateSource( file_pattern, compression_type, @@ -212,7 +212,7 @@ def __init__( `, in which case the underlying file_path's extension will be used to detect the compression. """ - super(GetAllEstimates, self).__init__(**kwargs) + super().__init__(**kwargs) source_from_file = partial( _create_vcf_estimate_source, compression_type=compression_type) diff --git a/gcp_variant_transforms/beam_io/vcf_header_io.py b/gcp_variant_transforms/beam_io/vcf_header_io.py index ac9a8efc3..aedeb861e 100644 --- a/gcp_variant_transforms/beam_io/vcf_header_io.py +++ b/gcp_variant_transforms/beam_io/vcf_header_io.py @@ -14,7 +14,6 @@ """A source for reading VCF file headers.""" -from __future__ import absolute_import import collections from functools import partial @@ -44,7 +43,7 @@ FILE_FORMAT_HEADER_TEMPLATE = '##fileformat=VCFv{VERSION}' -class VcfHeaderFieldTypeConstants(object): +class VcfHeaderFieldTypeConstants(): """Constants for types from VCF header.""" FLOAT = 'Float' INTEGER = 'Integer' @@ -53,7 +52,7 @@ class VcfHeaderFieldTypeConstants(object): CHARACTER = 'Character' -class VcfParserHeaderKeyConstants(object): +class VcfParserHeaderKeyConstants(): """Constants for header fields from the parser.""" ID = 'id' NUM = 'num' @@ -64,7 +63,7 @@ class VcfParserHeaderKeyConstants(object): LENGTH = 'length' -class PysamHeaderKeyConstants(object): +class PysamHeaderKeyConstants(): """Constants for header fields from the parser.""" NUM = 'Number' TYPE = 'Type' @@ -104,7 +103,7 @@ def CreateFormatField(info_id, number, info_type, description=''): VariantHeaderMetadataMock = collections.namedtuple( 'VariantHeaderMetadata', ['id', 'record']) -class VcfHeader(object): +class VcfHeader(): """Container for header data.""" def __init__(self, @@ -257,7 +256,7 @@ def _get_samples(self, sample_line): def _verify_header(self, fields, is_format): # type: (Dict[str, VariantHeaderMetadata], bool) -> None """Verifies the integrity of INFO and FORMAT fields""" - for header_id, field in fields.iteritems(): + for header_id, field in fields.items(): # ID, Description, Type and Number are mandatory fields. if not header_id: raise ValueError('Corrupt ID at header line {}.'.format(field.id)) @@ -271,12 +270,13 @@ def _verify_header(self, fields, is_format): # Number can only be a number or one of 'A', 'R', 'G' and '.'. 
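Reviewer note: `_verify_header` switches `fields.iteritems()` to `fields.items()`; `iteritems()` no longer exists in Python 3, where `items()` returns a lightweight view rather than a list, so the old lazy behavior is preserved. In isolation:

```python
# Toy mapping of INFO field IDs to their Number values.
fields = {'AF': 'A', 'NS': '1'}

# Python 2's dict.iteritems() is gone; items() now iterates lazily.
for header_id, number in fields.items():
    print(header_id, number)
```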
if PysamHeaderKeyConstants.NUM not in field.record: raise ValueError('No number for header line {}.'.format(field.id)) - elif (field.record[PysamHeaderKeyConstants.NUM] not in - HEADER_SPECIAL_NUMBERS): + if (field.record[PysamHeaderKeyConstants.NUM] not in + HEADER_SPECIAL_NUMBERS): try: int(field.record[PysamHeaderKeyConstants.NUM]) - except ValueError: - raise ValueError('Unknown Number at header line {}.'.format(field.id)) + except ValueError as e: + raise ValueError( + 'Unknown Number at header line {}.'.format(field.id)) from e class VcfHeaderSource(filebasedsource.FileBasedSource): @@ -287,10 +287,10 @@ def __init__(self, compression_type=CompressionTypes.AUTO, validate=True): # type: (str, str, bool) -> None - super(VcfHeaderSource, self).__init__(file_pattern, - compression_type=compression_type, - validate=validate, - splittable=False) + super().__init__(file_pattern, + compression_type=compression_type, + validate=validate, + splittable=False) self._compression_type = compression_type def read_records( @@ -334,9 +334,9 @@ def _read_headers_plus_one_record(self, file_path): with self.open_file(file_path) as file_to_read: record = None while True: - record = file_to_read.readline() + record = file_to_read.readline().decode('utf-8') while record and not record.strip(): # Skip empty lines. - record = file_to_read.readline() + record = file_to_read.readline().decode('utf-8') if record and record.startswith('#'): yield record.strip() else: @@ -376,7 +376,7 @@ def __init__( validate: Flag to verify that the files exist during the pipeline creation time. """ - super(ReadVcfHeaders, self).__init__(**kwargs) + super().__init__(**kwargs) self._source = VcfHeaderSource( file_pattern, compression_type, @@ -424,7 +424,7 @@ def __init__( `, in which case the underlying file_path's extension will be used to detect the compression. 
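Reviewer note: the rewritten Number validation uses `raise ... from e`, Python 3's exception chaining, which records the original `ValueError` as `__cause__` instead of silently discarding it. A minimal sketch of the pattern:

```python
def parse_number(num_str):
    try:
        return int(num_str)
    except ValueError as e:
        # Chaining with 'from e' keeps the original traceback attached as
        # __cause__, so the root failure is still visible in logs.
        raise ValueError('Unknown Number: {}'.format(num_str)) from e

assert parse_number('5') == 5
```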
""" - super(ReadAllVcfHeaders, self).__init__(**kwargs) + super().__init__(**kwargs) source_from_file = partial( CreateVcfHeaderSource, compression_type=compression_type) @@ -438,7 +438,7 @@ def expand(self, pvalue): return pvalue | 'ReadAllFiles' >> self._read_all_files -class HeaderTypeConstants(object): +class HeaderTypeConstants(): INFO = 'INFO' FILTER = 'FILTER' ALT = 'ALT' @@ -446,7 +446,7 @@ class HeaderTypeConstants(object): CONTIG = 'contig' -class _HeaderFieldKeyConstants(object): +class _HeaderFieldKeyConstants(): ID = 'ID' NUMBER = 'Number' TYPE = 'Type' @@ -460,7 +460,7 @@ class WriteVcfHeaderFn(beam.DoFn): """A DoFn for writing VCF headers to a file.""" HEADER_TEMPLATE = '##{}=<{}>\n' - FINAL_HEADER_LINE = '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT\n' + FINAL_HEADER_LINE = b'#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT\n' def __init__(self, file_path): # type: (str) -> None @@ -471,7 +471,7 @@ def process(self, header, vcf_version_line=None): # type: (VcfHeader, str) -> None with FileSystems.create(self._file_path) as self._file_to_write: if vcf_version_line: - self._file_to_write.write(vcf_version_line) + self._file_to_write.write(vcf_version_line.encode('utf-8')) self._write_headers_by_type(HeaderTypeConstants.INFO, header.infos) self._write_headers_by_type(HeaderTypeConstants.FILTER, header.filters) self._write_headers_by_type(HeaderTypeConstants.ALT, header.alts) @@ -490,7 +490,7 @@ def _write_headers_by_type(self, header_type, headers): """ for header in headers.values(): self._file_to_write.write( - self._to_vcf_header_line(header_type, header)) + self._to_vcf_header_line(header_type, header).encode('utf-8')) def _to_vcf_header_line(self, header_type, header): # type: (str, Dict[str, Union[str, int]]) -> str @@ -519,13 +519,13 @@ def _format_header(self, header): A formatted string composed of header keys and values. 
""" formatted_values = [] - for key, value in header.iteritems(): + for key, value in header.items(): if self._should_include_key_value(key, value): formatted_values.append(self._format_header_key_value(key, value)) return ','.join(formatted_values) def _should_include_key_value(self, key, value): - return value is not None or (key != 'source' and key != 'version') + return value is not None or (key not in ('source', 'version')) def _format_header_key_value(self, key, value): # type: (str, Union[str, int]) -> str @@ -544,9 +544,9 @@ def _format_header_key_value(self, key, value): value = vcfio.MISSING_FIELD_VALUE elif key == _HeaderFieldKeyConstants.NUMBER: value = self._format_number(value) - elif (key == _HeaderFieldKeyConstants.DESCRIPTION - or key == _HeaderFieldKeyConstants.SOURCE - or key == _HeaderFieldKeyConstants.VERSION): + elif (key in (_HeaderFieldKeyConstants.DESCRIPTION, + _HeaderFieldKeyConstants.SOURCE, + _HeaderFieldKeyConstants.VERSION)): value = self._format_string_value(value) return '{}={}'.format(key, value) @@ -589,9 +589,9 @@ def _format_number(self, number): raise ValueError('Invalid value for number: {}'.format(number)) def _format_string_value(self, value): - # type: (str, unicode) -> str - if isinstance(value, unicode): - return '"{}"'.format(value.encode('utf-8')) + # type: (str, bytes) -> str + if isinstance(value, bytes): + return '"{}"'.format(value.decode('utf-8')) return '"{}"'.format(value) diff --git a/gcp_variant_transforms/beam_io/vcf_header_io_test.py b/gcp_variant_transforms/beam_io/vcf_header_io_test.py index 2b8fb239e..05e8f91cf 100644 --- a/gcp_variant_transforms/beam_io/vcf_header_io_test.py +++ b/gcp_variant_transforms/beam_io/vcf_header_io_test.py @@ -56,8 +56,7 @@ def _get_vcf_header_from_lines(lines, file_name=None): if line.startswith(LAST_HEADER_LINE_PREFIX): sample_line = line.strip() break - else: - header.add_line(line.strip()) + header.add_line(line.strip()) else: break return VcfHeader(infos=header.info, @@ -126,11 +125,12 @@ def test_all_fields(self): '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GS000016676-ASM\n', ] header = self._create_file_and_read_headers() - self.assertItemsEqual(header.contigs.keys(), ['M', 'P']) - self.assertItemsEqual(header.alts.keys(), ['CGA_CNVWIN', 'INS:ME:MER']) - self.assertItemsEqual(header.filters.keys(), ['MPCBT']) - self.assertItemsEqual(header.infos.keys(), ['CGA_MIRB']) - self.assertItemsEqual(header.formats.keys(), ['FT']) + self.assertCountEqual(list(header.contigs.keys()), ['M', 'P']) + self.assertCountEqual( + list(header.alts.keys()), ['CGA_CNVWIN', 'INS:ME:MER']) + self.assertCountEqual(list(header.filters.keys()), ['MPCBT']) + self.assertCountEqual(list(header.infos.keys()), ['CGA_MIRB']) + self.assertCountEqual(list(header.formats.keys()), ['FT']) def test_empty_header_raises_error(self): self.lines = testdata_util.get_sample_vcf_record_lines() @@ -285,7 +285,8 @@ def test_info_source_and_version(self): ] header = _get_vcf_header_from_lines(self.lines) header_fn = WriteVcfHeaderFn('') - actual = header_fn._to_vcf_header_line('INFO', header.infos.values()[0]) + actual = header_fn._to_vcf_header_line( + 'INFO', list(header.infos.values())[0]) expected = self.lines[0] self.assertEqual(actual, expected) @@ -296,7 +297,8 @@ def test_write_contig(self): ] header = _get_vcf_header_from_lines(self.lines) header_fn = WriteVcfHeaderFn('') - actual = header_fn._to_vcf_header_line('contig', header.contigs.values()[0]) + actual = header_fn._to_vcf_header_line( + 'contig', 
list(header.contigs.values())[0]) expected = '##contig=\n' self.assertEqual(actual, expected) @@ -311,10 +313,10 @@ def test_write_info_number_types(self): header = _get_vcf_header_from_lines(self.lines) header_fn = WriteVcfHeaderFn('') actual = [] - for info in header.infos.values(): + for info in list(header.infos.values()): actual.append(header_fn._to_vcf_header_line('INFO', info)) expected = self.lines[:-1] - self.assertItemsEqual(actual, expected) + self.assertCountEqual(actual, expected) def test_write_headers(self): header = _get_vcf_header_from_lines(self.lines) @@ -328,16 +330,16 @@ def test_write_headers_with_vcf_version_line(self): header = _get_vcf_header_from_lines(self.lines) vcf_version_line = '##fileformat=VCFv4.3\n' expected_results = [ - vcf_version_line, - '##INFO=\n', - '##INFO=\n', - '##INFO=\n', - '##INFO=\n', - '##FILTER=\n', - '##ALT=\n', - '##FORMAT=\n', - '##FORMAT=\n', - '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT\n' + vcf_version_line.encode('utf-8'), + b'##INFO=\n', + b'##INFO=\n', + b'##INFO=\n', + b'##INFO=\n', + b'##FILTER=\n', + b'##ALT=\n', + b'##FORMAT=\n', + b'##FORMAT=\n', + b'#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT\n' ] with temp_dir.TempDir() as tempdir: tempfile = tempdir.create_temp_file(suffix='.vcf') @@ -345,18 +347,18 @@ def test_write_headers_with_vcf_version_line(self): header_fn.process(header, vcf_version_line) with open(tempfile, 'rb') as f: actual = f.readlines() - self.assertItemsEqual(actual, expected_results) + self.assertCountEqual(actual, expected_results) def _remove_sample_names(self, line): # Return line with all columns except sample names. - return '\t'.join(line.split('\t')[:9]) + return b'\t'.join(line.split(b'\t')[:9]) def _assert_file_contents_equal(self, file_name, lines): with open(file_name, 'rb') as f: actual = f.read().splitlines() - expected = [s.strip() for s in lines[1:]] + expected = [s.strip().encode('utf-8') for s in lines[1:]] expected[-1] = self._remove_sample_names(expected[-1]) - self.assertItemsEqual(actual, expected) + self.assertCountEqual(actual, expected) def test_write_dataflow(self): header = _get_vcf_header_from_lines(self.lines) diff --git a/gcp_variant_transforms/beam_io/vcf_parser.py b/gcp_variant_transforms/beam_io/vcf_parser.py index b1bdd0a97..ba786a418 100644 --- a/gcp_variant_transforms/beam_io/vcf_parser.py +++ b/gcp_variant_transforms/beam_io/vcf_parser.py @@ -17,7 +17,7 @@ The 4.2 spec is available at https://samtools.github.io/hts-specs/VCFv4.2.pdf. """ -from __future__ import absolute_import + from collections import namedtuple import enum @@ -65,7 +65,7 @@ class SampleNameEncoding(enum.Enum): NONE = 2 -class Variant(object): +class Variant(): """A class to store info about a genomic variant. Each object corresponds to a single record in a VCF file. @@ -152,7 +152,8 @@ def __lt__(self, other): other_vars = vars(other) for key in sorted(self_vars): if self_vars[key] != other_vars[key]: - return self_vars[key] < other_vars[key] + return (other_vars[key] is not None and + (self_vars[key] is None or self_vars[key] < other_vars[key])) return False @@ -178,7 +179,7 @@ def __ge__(self, other): return other <= self -class VariantCall(object): +class VariantCall(): """A class to store info about a variant call. A call represents the determination of genotype with respect to a particular @@ -242,7 +243,7 @@ def __repr__(self): self.sample_id, self.genotype, self.phaseset, self.info]]) -class VcfParser(object): +class VcfParser(): """Base abstract class for defining a VCF file parser. 
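Reviewer note: the header tests replace `assertItemsEqual` with `assertCountEqual`, the Python 3 name for the same order-insensitive, duplicate-counting assertion, and wrap dict views in `list(...)` before indexing. For instance:

```python
import unittest

class HeaderKeysTest(unittest.TestCase):
    def test_keys_ignore_order(self):
        # assertItemsEqual (Python 2) was renamed assertCountEqual:
        # same elements with the same multiplicities, order ignored.
        self.assertCountEqual(['M', 'P'], ['P', 'M'])

if __name__ == '__main__':
    unittest.main()
```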
Derived classes must implement two methods: @@ -361,16 +362,16 @@ def _process_header_lines(self, header_lines): parsed_line = (line.strip().replace('Number=G', 'Number=.') if line.startswith(INFO_HEADER_TAG) else line.strip()) # Tests provide lines in unicode. - if isinstance(parsed_line, str): + if isinstance(parsed_line, bytes): parsed_line = parsed_line.decode('utf-8') if parsed_line: # For str cases, decode then re-encode lines in utf-8, to not use ascii # encoding. - parsed_header_lines.append(parsed_line.encode('utf-8')) + parsed_header_lines.append(parsed_line) self._init_with_header(parsed_header_lines) - def next(self): + def __next__(self): try: text_line = self._next_non_empty_line(self._text_lines) except StopIteration as e: @@ -437,17 +438,17 @@ def __init__( **kwargs # type: **str ): # type: (...) -> None - super(PySamParser, self).__init__(file_name, - range_tracker, - file_pattern, - compression_type, - allow_malformed_records, - representative_header_lines, - splittable_bgzf, - pre_infer_headers, - sample_name_encoding, - use_1_based_coordinate, - **kwargs) + super().__init__(file_name, + range_tracker, + file_pattern, + compression_type, + allow_malformed_records, + representative_header_lines, + splittable_bgzf, + pre_infer_headers, + sample_name_encoding, + use_1_based_coordinate, + **kwargs) # These members will be properly initiated in _init_parent_process(). self._vcf_reader = None self._to_child = None @@ -460,7 +461,6 @@ def send_kill_signal_to_child(self): self._to_child.flush() self._to_child.close() os.waitpid(self._process_pid, 0) - return def _init_parent_process(self, return_pipe_read, send_pipe_write): from_child = os.fdopen(return_pipe_read) @@ -516,7 +516,7 @@ def _init_with_header(self, header_lines): def _get_variant(self, data_line): try: - self._to_child.write(data_line.encode('utf-8') + '\n') + self._to_child.write(data_line + '\n') self._to_child.flush() return self._convert_to_variant(next(self._vcf_reader)) except (ValueError, StopIteration, TypeError) as e: @@ -548,7 +548,7 @@ def _convert_to_variant(self, record): """ self._verify_start_end(record) return Variant( - reference_name=record.chrom.encode('utf-8'), + reference_name=record.chrom, # record.pos is 1-based version of record.start (ie. record.start + 1). start=record.pos if self._use_1_based_coordinate else record.start, end=record.stop, @@ -612,8 +612,8 @@ def _convert_field(self, value, is_phaseset=False): if isinstance(value, float): return self._parse_float(value) # Sometimes PySam returns unicode strings, encode them as strings instead. - elif isinstance(value, unicode): - value = value.encode('utf-8') + elif isinstance(value, bytes): + value = value.decode('utf-8') return str(value) def _lookup_encoded_sample_name(self, sample_name): diff --git a/gcp_variant_transforms/beam_io/vcf_parser_test.py b/gcp_variant_transforms/beam_io/vcf_parser_test.py index 2ed772070..ff01f2fb5 100644 --- a/gcp_variant_transforms/beam_io/vcf_parser_test.py +++ b/gcp_variant_transforms/beam_io/vcf_parser_test.py @@ -14,7 +14,6 @@ """Tests for vcf_parser module.""" -from __future__ import absolute_import import logging import unittest diff --git a/gcp_variant_transforms/beam_io/vcfio.py b/gcp_variant_transforms/beam_io/vcfio.py index bd71af3c2..9859dda13 100644 --- a/gcp_variant_transforms/beam_io/vcfio.py +++ b/gcp_variant_transforms/beam_io/vcfio.py @@ -17,7 +17,6 @@ The 4.2 spec is available at https://samtools.github.io/hts-specs/VCFv4.2.pdf. 
""" -from __future__ import absolute_import from typing import Any, Iterable, List, Tuple # pylint: disable=unused-import from functools import partial @@ -65,7 +64,7 @@ def __init__(self, bq_uses_1_based_coordinate): self.bq_uses_1_based_coordinate = bq_uses_1_based_coordinate def encode(self, variant): - # type: (Variant) -> str + # type: (Variant) -> bytes """Converts a :class:`Variant` object back to a VCF line.""" encoded_info = self._encode_variant_info(variant) format_keys = self._get_variant_format_keys(variant) @@ -87,7 +86,7 @@ def encode(self, variant): columns.append(encoded_calls) columns = [self._encode_value(c) for c in columns] - return '\t'.join(columns) + '\n' + return ('\t'.join(columns) + '\n').encode('utf-8') def _encode_value(self, value): # type: (Any) -> str @@ -96,7 +95,7 @@ def _encode_value(self, value): return MISSING_FIELD_VALUE elif isinstance(value, list): return ','.join([self._encode_value(x) for x in value]) - return value.encode('utf-8') if isinstance(value, unicode) else str(value) + return value.decode('utf-8') if isinstance(value, bytes) else str(value) def _encode_variant_info(self, variant): """Encodes the info of a :class:`Variant` for a VCF file line.""" @@ -112,7 +111,7 @@ def _encode_variant_info(self, variant): and start_0_based + len(variant.reference_bases) != variant.end): encoded_infos.append('END=%d' % variant.end) # Set all other fields of info. - for k, v in variant.info.iteritems(): + for k, v in variant.info.items(): if v is True: encoded_infos.append(k) else: @@ -130,7 +129,7 @@ def _get_variant_format_keys(self, variant): # the key will be added to the format field. if self._is_alternate_phaseset(call.phaseset): format_keys.append(PHASESET_FORMAT_KEY) - format_keys.extend([k for k in call.info]) + format_keys.extend(list(k for k in call.info)) # Sort all keys and remove duplicates after GENOTYPE_FORMAT_KEY format_keys[1:] = sorted(list(set(format_keys[1:]))) @@ -211,9 +210,9 @@ def __init__( use_1_based_coordinate=False # type: bool ): # type: (...) -> None - super(_VcfSource, self).__init__(file_pattern, - compression_type=compression_type, - validate=validate) + super().__init__(file_pattern, + compression_type=compression_type, + validate=validate) self._representative_header_lines = representative_header_lines self._compression_type = compression_type self._buffer_size = buffer_size @@ -280,9 +279,10 @@ def __init__(self, self._sample_name_encoding = sample_name_encoding self._use_1_based_coordinate = use_1_based_coordinate - def _read_records(self, (file_path, block)): + def _read_records(self, file_path_and_block_tuple): # type: (Tuple[str, Block]) -> Iterable(Variant) """Reads records from `file_path` in `block`.""" + (file_path, block) = file_path_and_block_tuple record_iterator = vcf_parser.PySamParser( file_path, block, @@ -348,7 +348,7 @@ def __init__( use_1_based_coordinate: specify whether the coordinates should be stored in BQ using 0-based exclusive (default) or 1-based inclusive coordinate. """ - super(ReadFromVcf, self).__init__(**kwargs) + super().__init__(**kwargs) self._source = _VcfSource( file_pattern, @@ -426,7 +426,7 @@ def __init__( use_1_based_coordinate: specify whether the coordinates should be stored in BQ using 0-based exclusive (default) or 1-based inclusive coordinate. 
""" - super(ReadAllFromVcf, self).__init__(**kwargs) + super().__init__(**kwargs) source_from_file = partial( _create_vcf_source, representative_header_lines=representative_header_lines, @@ -508,8 +508,9 @@ def __init__(self, bq_uses_1_based_coordinate): """ self._coder = _ToVcfRecordCoder(bq_uses_1_based_coordinate) - def process(self, (file_path, variants), *args, **kwargs): + def process(self, file_path_and_variants_tuple, *args, **kwargs): # pylint: disable=unused-argument # type: (Tuple[str, List[Variant]]) -> None + (file_path, variants) = file_path_and_variants_tuple with filesystems.FileSystems.create(file_path) as file_to_write: for variant in variants: file_to_write.write(self._coder.encode(variant)) diff --git a/gcp_variant_transforms/beam_io/vcfio_test.py b/gcp_variant_transforms/beam_io/vcfio_test.py index ad774f02a..9ec8ce286 100644 --- a/gcp_variant_transforms/beam_io/vcfio_test.py +++ b/gcp_variant_transforms/beam_io/vcfio_test.py @@ -15,7 +15,6 @@ """Tests for vcfio module.""" -from __future__ import absolute_import import glob import gzip @@ -398,12 +397,11 @@ def test_single_file_verify_details_without_encoding(self): self.assertEqual(3, len(read_data)) self._assert_variants_equal([variant_1, variant_2, variant_3], read_data) - @unittest.skipIf(VCF_FILE_DIR_MISSING, 'VCF test file directory is missing') def test_read_after_splitting(self): file_name = testdata_util.get_full_file_path('valid-4.1-large.vcf') source = VcfSource(file_name) - splits = [p for p in source.split(desired_bundle_size=500)] + splits = list(p for p in source.split(desired_bundle_size=500)) self.assertGreater(len(splits), 1) sources_info = ([ (split.source, split.start_position, split.stop_position) for @@ -803,7 +801,7 @@ def test_read_reentrant_after_splitting(self): file_name = self._create_temp_vcf_file( _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir) source = VcfSource(file_name) - splits = [split for split in source.split(desired_bundle_size=100000)] + splits = list(split for split in source.split(desired_bundle_size=100000)) assert len(splits) == 1 source_test_utils.assert_reentrant_reads_succeed( (splits[0].source, splits[0].start_position, splits[0].stop_position)) @@ -813,7 +811,7 @@ def test_dynamic_work_rebalancing(self): file_name = self._create_temp_vcf_file( _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir) source = VcfSource(file_name) - splits = [split for split in source.split(desired_bundle_size=100000)] + splits = list(split for split in source.split(desired_bundle_size=100000)) assert len(splits) == 1 source_test_utils.assert_split_at_fraction_exhaustive( splits[0].source, splits[0].start_position, splits[0].stop_position) @@ -822,13 +820,13 @@ def test_dynamic_work_rebalancing(self): class VcfSinkTest(unittest.TestCase): def setUp(self): - super(VcfSinkTest, self).setUp() + super().setUp() self.path = tempfile.NamedTemporaryFile(suffix='.vcf').name - self.variants, self.variant_lines = zip( + self.variants, self.variant_lines = list(zip( (_get_sample_variant_1(), VCF_LINE_1), (_get_sample_variant_2(), VCF_LINE_2), (_get_sample_variant_3(), VCF_LINE_3), - (_get_sample_non_variant(), GVCF_LINE)) + (_get_sample_non_variant(), GVCF_LINE))) def _assert_variant_lines_equal(self, actual, expected): actual_fields = actual.strip().split('\t') @@ -837,17 +835,17 @@ def _assert_variant_lines_equal(self, actual, expected): self.assertEqual(len(actual_fields), len(expected_fields)) self.assertEqual(actual_fields[0], expected_fields[0]) self.assertEqual(actual_fields[1], 
expected_fields[1]) - self.assertItemsEqual(actual_fields[2].split(';'), + self.assertCountEqual(actual_fields[2].split(';'), expected_fields[2].split(';')) self.assertEqual(actual_fields[3], expected_fields[3]) - self.assertItemsEqual(actual_fields[4].split(','), + self.assertCountEqual(actual_fields[4].split(','), expected_fields[4].split(',')) self.assertEqual(actual_fields[5], actual_fields[5]) - self.assertItemsEqual(actual_fields[6].split(';'), + self.assertCountEqual(actual_fields[6].split(';'), expected_fields[6].split(';')) - self.assertItemsEqual(actual_fields[7].split(';'), + self.assertCountEqual(actual_fields[7].split(';'), expected_fields[7].split(';')) - self.assertItemsEqual(actual_fields[8].split(':'), + self.assertCountEqual(actual_fields[8].split(':'), expected_fields[8].split(':')) # Assert calls are the same @@ -857,7 +855,7 @@ def _assert_variant_lines_equal(self, actual, expected): # Compare the first and third values of the GT field self.assertEqual(actual_split[0], expected_split[0]) # Compare the rest of the items ignoring order - self.assertItemsEqual(actual_split[1:], expected_split[1:]) + self.assertCountEqual(actual_split[1:], expected_split[1:]) def _get_coder(self, bq_uses_1_based_coordinate=False): return vcfio._ToVcfRecordCoder(bq_uses_1_based_coordinate) @@ -866,11 +864,11 @@ def test_to_vcf_line_0_based(self): coder = self._get_coder() for variant, line in zip(self.variants, self.variant_lines): self._assert_variant_lines_equal( - coder.encode(variant), line) + coder.encode(variant).decode('utf-8'), line) empty_variant = vcfio.Variant() empty_line = '\t'.join(['.' for _ in range(9)]) self._assert_variant_lines_equal( - coder.encode(empty_variant), empty_line) + coder.encode(empty_variant).decode('utf-8'), empty_line) def test_to_vcf_line_1_based(self): coder = self._get_coder(bq_uses_1_based_coordinate=True) @@ -881,11 +879,11 @@ def test_to_vcf_line_1_based(self): _get_sample_non_variant(use_1_based_coordinate=True)] for variant, line in zip(variants, self.variant_lines): self._assert_variant_lines_equal( - coder.encode(variant), line) + coder.encode(variant).decode('utf-8'), line) empty_variant = vcfio.Variant() empty_line = '\t'.join(['.' for _ in range(9)]) self._assert_variant_lines_equal( - coder.encode(empty_variant), empty_line) + coder.encode(empty_variant).decode('utf-8'), empty_line) def test_missing_info_key(self): coder = self._get_coder() @@ -898,7 +896,8 @@ def test_missing_info_key(self): expected = ('. . . . . . . . GT:AF:GQ 0/1:20:10 ' '0/1:20:.\n') - self._assert_variant_lines_equal(coder.encode(variant), expected) + self._assert_variant_lines_equal( + coder.encode(variant).decode('utf-8'), expected) def test_info_list(self): coder = self._get_coder() @@ -908,7 +907,8 @@ def test_info_list(self): info={'LI': [1, None, 3]})) expected = '. . . . . . . . GT:LI 0/1:1,.,3\n' - self._assert_variant_lines_equal(coder.encode(variant), expected) + self._assert_variant_lines_equal( + coder.encode(variant).decode('utf-8'), expected) def test_info_field_count(self): coder = self._get_coder() @@ -921,7 +921,8 @@ def test_info_field_count(self): expected = ('. . . . . . . 
NS=3;AF=0.333,0.667;DB;' 'CSQ=G|upstream_gene_variant||MODIFIER,T|||MODIFIER .\n') - self._assert_variant_lines_equal(coder.encode(variant), expected) + self._assert_variant_lines_equal( + coder.encode(variant).decode('utf-8'), expected) def test_empty_sample_calls(self): coder = self._get_coder() @@ -929,7 +930,8 @@ def test_empty_sample_calls(self): variant.calls.append( VariantCall(sample_id=hash_name('Sample2'), genotype=-1)) expected = '. . . . . . . . GT .\n' - self._assert_variant_lines_equal(coder.encode(variant), expected) + self._assert_variant_lines_equal( + coder.encode(variant).decode('utf-8'), expected) def test_missing_genotype(self): coder = self._get_coder() @@ -939,7 +941,8 @@ def test_missing_genotype(self): genotype=[1, vcfio.MISSING_GENOTYPE_VALUE])) expected = '. . . . . . . . GT 1/.\n' - self._assert_variant_lines_equal(coder.encode(variant), expected) + self._assert_variant_lines_equal( + coder.encode(variant).decode('utf-8'), expected) def test_triploid_genotype(self): coder = self._get_coder() @@ -948,7 +951,8 @@ def test_triploid_genotype(self): sample_id=hash_name('Sample'), genotype=[1, 0, 1])) expected = '. . . . . . . . GT 1/0/1\n' - self._assert_variant_lines_equal(coder.encode(variant), expected) + self._assert_variant_lines_equal( + coder.encode(variant).decode('utf-8'), expected) def test_write_dataflow_0_based(self): pipeline = TestPipeline() @@ -999,7 +1003,7 @@ def test_write_dataflow_auto_compression(self): read_result.extend(f.read().splitlines()) for actual, expected in zip(read_result, self.variant_lines): - self._assert_variant_lines_equal(actual, expected) + self._assert_variant_lines_equal(actual.decode('utf-8'), expected) def test_write_dataflow_header(self): pipeline = TestPipeline() @@ -1017,9 +1021,9 @@ def test_write_dataflow_header(self): with gzip.GzipFile(file_name, 'r') as f: read_result.extend(f.read().splitlines()) - self.assertEqual(read_result[0], 'foo') + self.assertEqual(read_result[0].decode('utf-8'), 'foo') for actual, expected in zip(read_result[1:], self.variant_lines): - self._assert_variant_lines_equal(actual, expected) + self._assert_variant_lines_equal(actual.decode('utf-8'), expected) if __name__ == '__main__': diff --git a/gcp_variant_transforms/bq_to_vcf.py b/gcp_variant_transforms/bq_to_vcf.py index 2e4b4bf58..18870177a 100644 --- a/gcp_variant_transforms/bq_to_vcf.py +++ b/gcp_variant_transforms/bq_to_vcf.py @@ -39,8 +39,6 @@ --runner DataflowRunner """ -from __future__ import absolute_import -from __future__ import division import logging import sys @@ -64,6 +62,7 @@ from gcp_variant_transforms.libs import genomic_region_parser from gcp_variant_transforms.libs import sample_info_table_schema_generator from gcp_variant_transforms.libs import vcf_file_composer +from gcp_variant_transforms.libs import vcf_header_parser from gcp_variant_transforms.options import variant_transform_options from gcp_variant_transforms.transforms import bigquery_to_variant from gcp_variant_transforms.transforms import combine_sample_ids @@ -342,19 +341,17 @@ def _write_vcf_header_with_sample_names(sample_names, meta-information. file_path: The location where the VCF headers is saved. 
""" - # pylint: disable=redefined-outer-name,reimported - from apache_beam.io import filesystems - from gcp_variant_transforms.libs import vcf_header_parser metadata_header_lines = vcf_header_parser.get_metadata_header_lines( representative_header_file) with filesystems.FileSystems.create(file_path) as file_to_write: - file_to_write.write(''.join(metadata_header_lines)) + file_to_write.write(str(''.join(metadata_header_lines)).encode('utf-8')) file_to_write.write( - str('\t'.join(vcf_fixed_columns + sample_names))) - file_to_write.write('\n') + str('\t'.join(vcf_fixed_columns + sample_names)).encode('utf-8')) + file_to_write.write(b'\n') -def _get_file_path_and_sorted_variants((file_name, variants), file_path_prefix): +def _get_file_path_and_sorted_variants(file_name_and_variants, + file_path_prefix): # type: (Tuple[str, List], str) -> Iterable[Tuple[str, List]] """Returns the file path and the sorted variants. @@ -366,8 +363,7 @@ def _get_file_path_and_sorted_variants((file_name, variants), file_path_prefix): pipeline. The files written will begin with this prefix, followed by the `file_name`. """ - # pylint: disable=redefined-outer-name,reimported - from apache_beam.io import filesystems + (file_name, variants) = file_name_and_variants file_path = filesystems.FileSystems.join(file_path_prefix, file_name) yield (file_path, sorted(variants)) diff --git a/gcp_variant_transforms/bq_to_vcf_test.py b/gcp_variant_transforms/bq_to_vcf_test.py index 4f5640d89..e818efe81 100644 --- a/gcp_variant_transforms/bq_to_vcf_test.py +++ b/gcp_variant_transforms/bq_to_vcf_test.py @@ -30,7 +30,8 @@ class BqToVcfTest(unittest.TestCase): """Test cases for the `bq_to_vcf` module.""" def _create_mock_args(self, **args): - return collections.namedtuple('MockArgs', args.keys())(*args.values()) + return collections.namedtuple( + 'MockArgs', list(args.keys()))(*list(args.values())) def test_write_vcf_data_header(self): lines = [ @@ -58,7 +59,7 @@ def test_write_vcf_data_header(self): '#CHROM\tPOS\tID\tREF\tALT\tSample 1\tSample 2\n' ] with filesystems.FileSystems.open(file_path) as f: - content = f.readlines() + content = [line.decode('utf-8') for line in f.readlines()] self.assertEqual(content, expected_content) def test_get_variant_query_no_region(self): diff --git a/gcp_variant_transforms/libs/annotation/annotation_parser.py b/gcp_variant_transforms/libs/annotation/annotation_parser.py index f526539cb..0a613ab6b 100644 --- a/gcp_variant_transforms/libs/annotation/annotation_parser.py +++ b/gcp_variant_transforms/libs/annotation/annotation_parser.py @@ -18,7 +18,6 @@ some helper methods that can be used in different contexts. """ -from __future__ import absolute_import import re @@ -64,7 +63,7 @@ class AnnotationAltNotFound(AnnotationParserException): pass -class Parser(object): +class Parser(): """The main class for parsing annotation fields of a single variant record. The expected usage is to pass information about one variant record, namely the @@ -298,10 +297,10 @@ def _find_alt_index_by_allele_num(self, annotation_map): alt_index + 1, 1, len(self._alt_list))) return alt_index except ValueError as e: - raise InvalidAlleleNumValue(e) + raise InvalidAlleleNumValue(e) from e -class AnnotationStrBuilder(object): +class AnnotationStrBuilder(): """The class for reconstructing annotation str.""" def __init__(self, annotation_id_to_annotation_names): @@ -367,7 +366,7 @@ def is_valid_annotation_id(self, key): names are given such that the annotation string can be reconstructed. 
""" return (self._annotation_id_to_annotation_names and - key in self._annotation_id_to_annotation_names.keys()) + key in list(self._annotation_id_to_annotation_names.keys())) def extract_annotation_list_with_alt(annotation_str): diff --git a/gcp_variant_transforms/libs/annotation/annotation_parser_test.py b/gcp_variant_transforms/libs/annotation/annotation_parser_test.py index 2913feaf0..cbbbd8a3b 100644 --- a/gcp_variant_transforms/libs/annotation/annotation_parser_test.py +++ b/gcp_variant_transforms/libs/annotation/annotation_parser_test.py @@ -18,7 +18,6 @@ through unit-testing of processed_variant module. """ -from __future__ import absolute_import import unittest @@ -40,7 +39,7 @@ def test_extract_annotation_names(self): def test_extract_annotation_names_error(self): annotation_str = 'some desc-Consequence-IMPACT-SYMBOL-Gene' - with self.assertRaisesRegexp(ValueError, 'Expected at least one.*'): + with self.assertRaisesRegex(ValueError, 'Expected at least one.*'): annotation_parser.extract_annotation_names(annotation_str) @@ -51,14 +50,14 @@ def test_reconstruct_annotation_str(self): str_builder = annotation_parser.AnnotationStrBuilder({ 'CSQ': ['allele', 'Consequence', 'AF', 'IMPACT'], 'CSQ_2': ['allele', 'Consequence', 'IMPACT']}) - annotation_maps = [{u'allele': u'G', - u'Consequence': u'upstream_gene_variant', - u'AF': u'', - u'IMPACT': u'MODIFIER'}, - {u'allele': u'G', - u'Consequence': u'upstream_gene_variant', - u'AF': u'0.1', - u'IMPACT': u''}] + annotation_maps = [{'allele': 'G', + 'Consequence': 'upstream_gene_variant', + 'AF': '', + 'IMPACT': 'MODIFIER'}, + {'allele': 'G', + 'Consequence': 'upstream_gene_variant', + 'AF': '0.1', + 'IMPACT': ''}] expected_annotation_strs = ['G|upstream_gene_variant||MODIFIER', 'G|upstream_gene_variant|0.1|'] @@ -69,13 +68,13 @@ def test_reconstruct_annotation_str(self): def test_reconstruct_annotation_str_missing_annotation_names(self): str_builder = annotation_parser.AnnotationStrBuilder(None) - annotation_maps = [{u'Consequence': u'upstream_gene_variant'}] + annotation_maps = [{'Consequence': 'upstream_gene_variant'}] with self.assertRaises(ValueError): list(str_builder.reconstruct_annotation_str('CSQ', annotation_maps)) str_builder = annotation_parser.AnnotationStrBuilder( {'CSQ2': ['Consequence', 'AF']}) - annotation_maps = [{u'Consequence': u'upstream_gene_variant'}] + annotation_maps = [{'Consequence': 'upstream_gene_variant'}] with self.assertRaises(ValueError): list(str_builder.reconstruct_annotation_str('CSQ', annotation_maps)) diff --git a/gcp_variant_transforms/libs/annotation/vep/file_metadata_stub.py b/gcp_variant_transforms/libs/annotation/vep/file_metadata_stub.py index 618734765..a53d8bc60 100644 --- a/gcp_variant_transforms/libs/annotation/vep/file_metadata_stub.py +++ b/gcp_variant_transforms/libs/annotation/vep/file_metadata_stub.py @@ -13,7 +13,7 @@ # limitations under the License. -class FileMetadataStub(object): +class FileMetadataStub(): """This is an object to imitate apache_beam.io.filesystem.FileMetadata.""" def __init__(self, path, size_in_bytes): diff --git a/gcp_variant_transforms/libs/annotation/vep/vep_runner.py b/gcp_variant_transforms/libs/annotation/vep/vep_runner.py index c0d79cb05..2bb948d18 100644 --- a/gcp_variant_transforms/libs/annotation/vep/vep_runner.py +++ b/gcp_variant_transforms/libs/annotation/vep/vep_runner.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import import argparse # pylint: disable=unused-import import logging @@ -95,7 +94,7 @@ def create_runner(known_args, pipeline_args, input_pattern, watchdog_file, return runner -class VepRunner(object): +class VepRunner(): """A class for running vep through Pipelines API on a set of input files.""" _VEP_CACHE_BASE = ('gs://cloud-lifesciences/vep/' @@ -387,11 +386,11 @@ def run_on_all_files(self): def _call_pipelines_api(self, io_infos, output_log_path): # type: (vep_runner_util.SingleWorkerActions, str) -> str api_request = self._get_api_request_fixed_parts() - size_gb = io_infos.disk_size_bytes / (1 << 30) + size_gb = io_infos.disk_size_bytes // (1 << 30) api_request[_API_PIPELINE]['resources'][ 'virtualMachine']['disks'][0]['sizeGb'] = ( size_gb + _MINIMUM_DISK_SIZE_GB) - for input_file, output_file in io_infos.io_map.iteritems(): + for input_file, output_file in io_infos.io_map.items(): api_request[_API_PIPELINE][_API_ACTIONS].extend( self._create_actions(input_file, output_file)) api_request[_API_PIPELINE][_API_ACTIONS].append( diff --git a/gcp_variant_transforms/libs/annotation/vep/vep_runner_test.py b/gcp_variant_transforms/libs/annotation/vep/vep_runner_test.py index 8ca526d29..280b7e448 100644 --- a/gcp_variant_transforms/libs/annotation/vep/vep_runner_test.py +++ b/gcp_variant_transforms/libs/annotation/vep/vep_runner_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import import unittest @@ -97,7 +96,7 @@ def test_instantiation(self): def test_instantiation_bad_pipeline_options(self): """This is to test object construction fails with incomplete arguments.""" - with self.assertRaisesRegexp(ValueError, '.*project.*'): + with self.assertRaisesRegex(ValueError, '.*project.*'): self._create_test_instance(pipeline_args=['no_arguments']) def test_make_vep_cache_path(self): @@ -133,7 +132,7 @@ def _validate_run_for_all_files(self): self._pipelines_spy.validate_calls([f[0] for f in _INPUT_FILES_WITH_SIZE]) def test_run_on_all_files(self): - num_workers = len(_INPUT_FILES_WITH_SIZE) / 2 + 1 + num_workers = len(_INPUT_FILES_WITH_SIZE) // 2 + 1 test_instance = self._create_test_instance( self._get_pipeline_args(num_workers)) with patch('apache_beam.io.filesystems.FileSystems', _MockFileSystems): @@ -162,7 +161,7 @@ def test_wait_until_done(self): test_instance = self._create_test_instance() with patch('apache_beam.io.filesystems.FileSystems', _MockFileSystems): test_instance.run_on_all_files() - with self.assertRaisesRegexp(AssertionError, '.*already.*running.*'): + with self.assertRaisesRegex(AssertionError, '.*already.*running.*'): # Since there are running operations, the next call raises an exception. 
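Reviewer note: `disk_size_bytes / (1 << 30)` becomes `//` in `_call_pipelines_api` (and similarly in the worker-count test) because Python 3's `/` is true division and returns a float, while the API request needs an integer `sizeGb`:

```python
disk_size_bytes = 5 * (1 << 30)
size_gb = disk_size_bytes // (1 << 30)   # 5 (int): floor division
ratio = disk_size_bytes / (1 << 30)      # 5.0 (float): true division
assert size_gb == 5 and ratio == 5.0
```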
test_instance.run_on_all_files() test_instance.wait_until_done() @@ -182,11 +181,11 @@ def test_wait_until_done_fail(self): test_instance = self._create_test_instance() with patch('apache_beam.io.filesystems.FileSystems', _MockFileSystems): test_instance.run_on_all_files() - with self.assertRaisesRegexp(RuntimeError, '.*failed.*retries.*'): + with self.assertRaisesRegex(RuntimeError, '.*failed.*retries.*'): test_instance.wait_until_done() -class PipelinesSpy(object): +class PipelinesSpy(): """A class to intercept calls to the run() function of Pipelines API.""" def __init__(self, mock_request): diff --git a/gcp_variant_transforms/libs/annotation/vep/vep_runner_util.py b/gcp_variant_transforms/libs/annotation/vep/vep_runner_util.py index 1f3009036..a49805b96 100644 --- a/gcp_variant_transforms/libs/annotation/vep/vep_runner_util.py +++ b/gcp_variant_transforms/libs/annotation/vep/vep_runner_util.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import -from __future__ import division import logging import math @@ -52,7 +50,7 @@ _MIN_NUM_OF_VARIANT = 50 * 1000 -class WorkerIOInfo(object): +class WorkerIOInfo(): """Holds information about input/output on a virtual machine. This is a pure data object and attributes can be accessed directly but the diff --git a/gcp_variant_transforms/libs/annotation/vep/vep_runner_util_test.py b/gcp_variant_transforms/libs/annotation/vep/vep_runner_util_test.py index fe0b829f8..6b7709f93 100644 --- a/gcp_variant_transforms/libs/annotation/vep/vep_runner_util_test.py +++ b/gcp_variant_transforms/libs/annotation/vep/vep_runner_util_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import import unittest @@ -63,7 +62,7 @@ def test_disribute_files_on_workers_multiple(self): len(_INPUT_FILES_WITH_SIZE)) merged_dict = {} for actions_list in worker_actions_list: - for k, v in actions_list.io_map.iteritems(): + for k, v in actions_list.io_map.items(): merged_dict[k] = v self.assertDictEqual( merged_dict, diff --git a/gcp_variant_transforms/libs/avro_util.py b/gcp_variant_transforms/libs/avro_util.py index af33894f1..c72dcd856 100644 --- a/gcp_variant_transforms/libs/avro_util.py +++ b/gcp_variant_transforms/libs/avro_util.py @@ -23,7 +23,7 @@ _MAX_NUM_CONCURRENT_AVRO_LOAD_JOBS = 4 -class LoadAvro(object): +class LoadAvro(): """Loads AVRO files from Cloud Storage to already created BigQuery tables.""" def __init__(self, avro_root_path, # type: str @@ -118,7 +118,7 @@ def _monitor_load_jobs(self): # Waits until current jobs are done and then add remaining jobs one by one. while self._suffixes_to_load_jobs: time.sleep(60) - processed_suffixes = self._suffixes_to_load_jobs.keys() + processed_suffixes = list(self._suffixes_to_load_jobs.keys()) for suffix in processed_suffixes: load_job = self._suffixes_to_load_jobs.get(suffix) if load_job.done(): diff --git a/gcp_variant_transforms/libs/bigquery_row_generator.py b/gcp_variant_transforms/libs/bigquery_row_generator.py index 37f971797..47987854b 100644 --- a/gcp_variant_transforms/libs/bigquery_row_generator.py +++ b/gcp_variant_transforms/libs/bigquery_row_generator.py @@ -29,7 +29,7 @@ _BigQuerySchemaSanitizer = bigquery_sanitizer.SchemaSanitizer -class BigQueryRowGenerator(object): +class BigQueryRowGenerator(): """Base abstract class for BigQuery row generator. 
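Reviewer note: `_monitor_load_jobs` copies the keys with `list(...)` before looping because Python 3's `dict.keys()` is a live view; deleting entries while iterating the view raises `RuntimeError`. In miniature:

```python
jobs = {'a': 'done', 'b': 'running'}

# Copy the keys first; mutating the dict while iterating the live
# keys() view would raise RuntimeError in Python 3.
for suffix in list(jobs.keys()):
    if jobs[suffix] == 'done':
        del jobs[suffix]

assert jobs == {'b': 'running'}
```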
The base class provides the common functionalities when generating BigQuery @@ -109,7 +109,7 @@ def _get_call_record( is_empty = (not call.genotype or set(call.genotype) == {vcf_parser.MISSING_GENOTYPE_VALUE}) - for key, data in call.info.iteritems(): + for key, data in call.info.items(): if data is not None: field_name, field_data = self._get_bigquery_field_entry( key, data, schema_descriptor, @@ -143,13 +143,13 @@ def _get_variant_meta_record(self, variant, allow_incompatible_records): for alt in variant.alternate_data_list: alt_record = {bigquery_util.ColumnKeyConstants.ALTERNATE_BASES_ALT: alt.alternate_bases} - for key, data in alt.info.iteritems(): + for key, data in alt.info.items(): alt_record[_BigQuerySchemaSanitizer.get_sanitized_field_name(key)] = ( data if key in alt.annotation_field_names else self._bigquery_field_sanitizer.get_sanitized_field(data)) row[bigquery_util.ColumnKeyConstants.ALTERNATE_BASES].append(alt_record) # Add info. - for key, data in variant.non_alt_info.iteritems(): + for key, data in variant.non_alt_info.items(): if data is not None: field_name, field_data = self._get_bigquery_field_entry( key, data, self._schema_descriptor, allow_incompatible_records) @@ -243,8 +243,7 @@ def _get_call_record( allow_incompatible_records, # type: bool ): # type: (...) -> (Dict[str, Any], bool) - call_record, is_empty = super( - VariantCallRowGenerator, self)._get_call_record( + call_record, is_empty = super()._get_call_record( call, schema_descriptor, allow_incompatible_records) call_record.update({ bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID: @@ -254,8 +253,8 @@ def _get_call_record( def _get_base_row_from_variant(self, variant, allow_incompatible_records): # type: (processed_variant.ProcessedVariant, bool) -> Dict[str, Any] - row = super(VariantCallRowGenerator, self)._get_base_variant_record(variant) - meta = super(VariantCallRowGenerator, self)._get_variant_meta_record( + row = super()._get_base_variant_record(variant) + meta = super()._get_variant_meta_record( variant, allow_incompatible_records) row.update(meta) # Set calls to empty for now (will be filled later). diff --git a/gcp_variant_transforms/libs/bigquery_row_generator_test.py b/gcp_variant_transforms/libs/bigquery_row_generator_test.py index 05d2b7544..221e0eb37 100644 --- a/gcp_variant_transforms/libs/bigquery_row_generator_test.py +++ b/gcp_variant_transforms/libs/bigquery_row_generator_test.py @@ -140,35 +140,35 @@ def _get_table_schema(): def _get_big_query_row(): # type: (...) 
-> Dict[unicode, Any] """Returns one sample BigQuery row for testing.""" - return {unicode(ColumnKeyConstants.REFERENCE_NAME): unicode('chr19'), - unicode(ColumnKeyConstants.START_POSITION): 11, - unicode(ColumnKeyConstants.END_POSITION): 12, - unicode(ColumnKeyConstants.REFERENCE_BASES): 'C', - unicode(ColumnKeyConstants.NAMES): [unicode('rs1'), unicode('rs2')], - unicode(ColumnKeyConstants.QUALITY): 2, - unicode(ColumnKeyConstants.FILTER): [unicode('PASS')], - unicode(ColumnKeyConstants.CALLS): [ - {unicode(ColumnKeyConstants.CALLS_SAMPLE_ID): ( - unicode(hash_name('Sample1'))), - unicode(ColumnKeyConstants.CALLS_GENOTYPE): [0, 1], - unicode(ColumnKeyConstants.CALLS_PHASESET): unicode('*'), - unicode('GQ'): 20, unicode('FIR'): [10, 20]}, - {unicode(ColumnKeyConstants.CALLS_SAMPLE_ID): ( - unicode(hash_name('Sample2'))), - unicode(ColumnKeyConstants.CALLS_GENOTYPE): [1, 0], - unicode(ColumnKeyConstants.CALLS_PHASESET): None, - unicode('GQ'): 10, unicode('FB'): True} + return {str(ColumnKeyConstants.REFERENCE_NAME): str('chr19'), + str(ColumnKeyConstants.START_POSITION): 11, + str(ColumnKeyConstants.END_POSITION): 12, + str(ColumnKeyConstants.REFERENCE_BASES): 'C', + str(ColumnKeyConstants.NAMES): [str('rs1'), str('rs2')], + str(ColumnKeyConstants.QUALITY): 2, + str(ColumnKeyConstants.FILTER): [str('PASS')], + str(ColumnKeyConstants.CALLS): [ + {str(ColumnKeyConstants.CALLS_SAMPLE_ID): ( + str(hash_name('Sample1'))), + str(ColumnKeyConstants.CALLS_GENOTYPE): [0, 1], + str(ColumnKeyConstants.CALLS_PHASESET): str('*'), + str('GQ'): 20, str('FIR'): [10, 20]}, + {str(ColumnKeyConstants.CALLS_SAMPLE_ID): ( + str(hash_name('Sample2'))), + str(ColumnKeyConstants.CALLS_GENOTYPE): [1, 0], + str(ColumnKeyConstants.CALLS_PHASESET): None, + str('GQ'): 10, str('FB'): True} ], - unicode(ColumnKeyConstants.ALTERNATE_BASES): [ - {unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): unicode('A'), - unicode('IFR'): 1, - unicode('IFR2'): 0.2}, - {unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): unicode('TT'), - unicode('IFR'): 0.2, - unicode('IFR2'): 0.3} + str(ColumnKeyConstants.ALTERNATE_BASES): [ + {str(ColumnKeyConstants.ALTERNATE_BASES_ALT): str('A'), + str('IFR'): 1, + str('IFR2'): 0.2}, + {str(ColumnKeyConstants.ALTERNATE_BASES_ALT): str('TT'), + str('IFR'): 0.2, + str('IFR2'): 0.3} ], - unicode('IS'): unicode('some data'), - unicode('ISR'): [unicode('data1'), unicode('data2')]} + str('IS'): str('some data'), + str('ISR'): [str('data1'), str('data2')]} class VariantCallRowGeneratorTest(unittest.TestCase): diff --git a/gcp_variant_transforms/libs/bigquery_sanitizer.py b/gcp_variant_transforms/libs/bigquery_sanitizer.py index 54a4a7656..fc589b262 100644 --- a/gcp_variant_transforms/libs/bigquery_sanitizer.py +++ b/gcp_variant_transforms/libs/bigquery_sanitizer.py @@ -31,7 +31,7 @@ _DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT = -2 ^ 31 -class SchemaSanitizer(object): +class SchemaSanitizer(): """Class to sanitize BigQuery schema according to BigQuery restrictions.""" @staticmethod @@ -63,7 +63,7 @@ def get_sanitized_field_name(field_name): return re.sub('[^a-zA-Z0-9_]', '_', field_name) -class FieldSanitizer(object): +class FieldSanitizer(): """Class to sanitize field values according to BigQuery restrictions.""" def __init__(self, null_numeric_value_replacement): @@ -107,7 +107,7 @@ def get_sanitized_field(self, field): """ if not field: return field - if isinstance(field, basestring): + if isinstance(field, (str, bytes)): return self._get_sanitized_string(field) elif isinstance(field, float): return 
self._get_sanitized_float(field) @@ -142,11 +142,11 @@ def _get_sanitized_list(self, input_list): for i in input_list: if i is None: continue - if isinstance(i, basestring): + if isinstance(i, str): null_replacement_value = vcfio.MISSING_FIELD_VALUE elif isinstance(i, bool): null_replacement_value = False - elif isinstance(i, (int, long, float)): + elif isinstance(i, (int, float)): null_replacement_value = self._null_numeric_value_replacement else: raise ValueError('Unsupported value for input: %s' % str(i)) @@ -157,7 +157,7 @@ def _get_sanitized_list(self, input_list): for i in input_list: if i is None: i = null_replacement_value - elif isinstance(i, basestring): + elif isinstance(i, (str, bytes)): i = self._get_sanitized_string(i) elif isinstance(i, float): sanitized_float = self._get_sanitized_float(i) @@ -183,15 +183,15 @@ def _get_sanitized_float(self, input_float): return input_float def _get_sanitized_string(self, input_str): - # type: (str) -> unicode + # type: (Any) -> str - """Returns a unicode as BigQuery API does not support UTF-8 strings.""" + """Returns a str; the BigQuery API does not accept UTF-8-encoded bytes.""" return _decode_utf8_string(input_str) def _decode_utf8_string(input_str): - # type: (str) -> unicode + # type: (Any) -> str try: return (input_str if isinstance(input_str, str) else input_str.decode('utf-8')) - except UnicodeDecodeError: - raise ValueError('input_str is not UTF-8: %s ' % (input_str)) + except UnicodeDecodeError as e: + raise ValueError('input_str is not UTF-8: %s ' % (input_str)) from e diff --git a/gcp_variant_transforms/libs/bigquery_sanitizer_test.py b/gcp_variant_transforms/libs/bigquery_sanitizer_test.py index c53c2a524..2124358b7 100644 --- a/gcp_variant_transforms/libs/bigquery_sanitizer_test.py +++ b/gcp_variant_transforms/libs/bigquery_sanitizer_test.py @@ -25,9 +25,9 @@ class BigQuerySanitizerTest(unittest.TestCase): def test_decode_utf8_string(self): - self.assertEqual(u'BÑD', + self.assertEqual('BÑD', bigquery_sanitizer._decode_utf8_string('BÑD')) - self.assertEqual(u'BD', + self.assertEqual('BD', bigquery_sanitizer._decode_utf8_string('BD')) def test_get_sanitized_field_name(self): @@ -40,10 +40,10 @@ def test_get_sanitized_field(self): sanitizer = bigquery_sanitizer.FieldSanitizer(None) - self.assertEqual(u'valid', + self.assertEqual('valid', sanitizer.get_sanitized_field('valid')) self.assertRaises(ValueError, - sanitizer.get_sanitized_field, '\x81DUMMY') + sanitizer.get_sanitized_field, b'\x81DUMMY') self.assertEqual([1, 2], sanitizer.get_sanitized_field([1, 2])) self.assertEqual( diff --git a/gcp_variant_transforms/libs/bigquery_schema_descriptor.py b/gcp_variant_transforms/libs/bigquery_schema_descriptor.py index 53e172bb8..a4841c9c3 100644 --- a/gcp_variant_transforms/libs/bigquery_schema_descriptor.py +++ b/gcp_variant_transforms/libs/bigquery_schema_descriptor.py @@ -13,7 +13,6 @@ # limitations under the License. """A dict based description for BigQuery schema.""" -from __future__ import absolute_import from typing import NamedTuple from apache_beam.io.gcp.internal.clients import bigquery # pylint: disable=unused-import @@ -23,7 +22,7 @@ FieldDescriptor = NamedTuple('FieldDescriptor', [('type', str), ('mode', str)]) -class SchemaDescriptor(object): +class SchemaDescriptor(): """A dict based description for :class:`bigquery.TableSchema` object. This class provides APIs for checking if and how (e.g.
type, mode) a field diff --git a/gcp_variant_transforms/libs/bigquery_schema_descriptor_test.py b/gcp_variant_transforms/libs/bigquery_schema_descriptor_test.py index b5a8b6c28..0ff9a62a4 100644 --- a/gcp_variant_transforms/libs/bigquery_schema_descriptor_test.py +++ b/gcp_variant_transforms/libs/bigquery_schema_descriptor_test.py @@ -15,8 +15,6 @@ """Tests for bigquery_schema_descriptor module.""" -from __future__ import absolute_import - import unittest from apache_beam.io.gcp.internal.clients import bigquery diff --git a/gcp_variant_transforms/libs/bigquery_util.py b/gcp_variant_transforms/libs/bigquery_util.py index 6ba2696b9..ea2581ea0 100644 --- a/gcp_variant_transforms/libs/bigquery_util.py +++ b/gcp_variant_transforms/libs/bigquery_util.py @@ -14,9 +14,7 @@ """Constants and simple utility functions related to BigQuery.""" -from concurrent.futures import TimeoutError import enum -import exceptions import logging import os import re @@ -40,7 +38,7 @@ BQ_NUM_RETRIES = 5 -class ColumnKeyConstants(object): +class ColumnKeyConstants(): """Constants for column names in the BigQuery schema.""" REFERENCE_NAME = 'reference_name' START_POSITION = 'start_position' @@ -57,7 +55,7 @@ class ColumnKeyConstants(object): CALLS_PHASESET = 'phaseset' -class TableFieldConstants(object): +class TableFieldConstants(): """Constants for field modes/types in the BigQuery schema.""" TYPE_STRING = 'STRING' TYPE_INTEGER = 'INTEGER' @@ -69,7 +67,7 @@ class TableFieldConstants(object): MODE_REPEATED = 'REPEATED' -class AvroConstants(object): +class AvroConstants(): """Constants that are relevant to Avro schema.""" TYPE = 'type' NAME = 'name' @@ -124,7 +122,7 @@ class _SupportedTableFieldType(enum.Enum): _BIG_QUERY_TYPE_TO_PYTHON_TYPE_MAP = { TableFieldConstants.TYPE_INTEGER: int, # Bigquery accepts unicode for strings. - TableFieldConstants.TYPE_STRING: unicode, + TableFieldConstants.TYPE_STRING: str, TableFieldConstants.TYPE_FLOAT: float, TableFieldConstants.TYPE_BOOLEAN: bool, } @@ -158,10 +156,9 @@ def raise_error_if_dataset_not_exists(client, project_id, dataset_id): except exceptions.HttpError as e: if e.status_code == 404: raise ValueError('Dataset %s:%s does not exist.' % - (project_id, dataset_id)) - else: - # For the rest of the errors, use BigQuery error message. - raise + (project_id, dataset_id)) from e + # For the rest of the errors, use BigQuery error message. 
+ raise def table_exist(client, project_id, dataset_id, table_id): @@ -196,20 +193,18 @@ def table_empty(project_id, dataset_id, table_id): time.sleep(90) else: raise e + if results.total_rows == 1: + break + logging.error('Query did not return expected # of rows: %s', query) + if num_retries < BQ_NUM_RETRIES: + num_retries += 1 + time.sleep(90) else: - if results.total_rows == 1: - break - else: - logging.error('Query did not returned expected # of rows: %s', query) - if num_retries < BQ_NUM_RETRIES: - num_retries += 1 - time.sleep(90) - else: - raise ValueError('Expected 1 row in query result, got {}'.format( - results.total_rows)) + raise ValueError('Expected 1 row in query result, got {}'.format( + results.total_rows)) row = list(results)[0] - col_names = row.keys() + col_names = list(row.keys()) if set(col_names) != {num_rows}: logging.error('Query `%s` did not return expected `%s` column.', query, num_rows) @@ -313,7 +308,7 @@ def update_bigquery_schema_on_append(schema_fields, output_table): table=existing_table, tableId=table_id)) except exceptions.HttpError as e: - raise RuntimeError('BigQuery schema update failed: %s' % str(e)) + raise RuntimeError('BigQuery schema update failed: %s' % str(e)) from e def _get_merged_field_schemas( @@ -342,7 +337,7 @@ def _get_merged_field_schemas( merged_field_schemas.append(field_schema) for field_schema in field_schemas_2: - if field_schema.name not in existing_fields.keys(): + if field_schema.name not in list(existing_fields.keys()): merged_field_schemas.append(field_schema) else: existing_field_schema = existing_fields.get(field_schema.name) diff --git a/gcp_variant_transforms/libs/bigquery_util_test.py b/gcp_variant_transforms/libs/bigquery_util_test.py index b41beb40c..9fba983dc 100644 --- a/gcp_variant_transforms/libs/bigquery_util_test.py +++ b/gcp_variant_transforms/libs/bigquery_util_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License.
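The `raise ... from e` edits above deserve a brief note: Python 3 supports explicit exception chaining, so re-raising with `from e` keeps the original error reachable as `__cause__` instead of burying it under the implicit "during handling of the above exception, another exception occurred" message. A minimal sketch of the pattern, using an invented `parse_port` helper rather than anything from this codebase:

```python
# Minimal sketch (hypothetical helper, not from this repo) of the
# `raise ... from e` chaining pattern applied throughout this migration.


def parse_port(value):
    try:
        return int(value)
    except ValueError as e:
        # Explicit chaining: the original ValueError stays attached to the
        # new, more descriptive exception as its __cause__.
        raise ValueError('not a valid port: %r' % value) from e


try:
    parse_port('abc')
except ValueError as err:
    assert isinstance(err.__cause__, ValueError)  # original error is kept
```

Pylint's `raise-missing-from` check flags the unchained form, which is likely why these hunks recur across so many files.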
-from __future__ import absolute_import import unittest from apitools.base.py import exceptions @@ -48,7 +47,7 @@ def test_get_python_from_bigquery_type(self): bigquery_util.TableFieldConstants.TYPE_INTEGER)) self.assertEqual(float, bigquery_util.get_python_type_from_bigquery_type( bigquery_util.TableFieldConstants.TYPE_FLOAT)) - self.assertEqual(unicode, + self.assertEqual(str, bigquery_util.get_python_type_from_bigquery_type( bigquery_util.TableFieldConstants.TYPE_STRING)) self.assertEqual(bool, bigquery_util.get_python_type_from_bigquery_type( diff --git a/gcp_variant_transforms/libs/bigquery_vcf_data_converter.py b/gcp_variant_transforms/libs/bigquery_vcf_data_converter.py index 2c823e713..608647340 100644 --- a/gcp_variant_transforms/libs/bigquery_vcf_data_converter.py +++ b/gcp_variant_transforms/libs/bigquery_vcf_data_converter.py @@ -14,8 +14,6 @@ """Converts BigQuery row to variant.""" -from __future__ import absolute_import -from __future__ import division from typing import Any, Dict, List # pylint: disable=unused-import @@ -44,7 +42,7 @@ ] -class VariantGenerator(object): +class VariantGenerator(): """Class to generate variant from one BigQuery row.""" def __init__(self, annotation_id_to_annotation_names=None): @@ -87,11 +85,11 @@ def _get_alternate_bases(self, alternate_base_records): def _get_variant_info(self, row): # type: (Dict[str, Any]) -> Dict[str, Any] info = {} - for key, value in row.iteritems(): + for key, value in row.items(): if key not in RESERVED_BQ_COLUMNS and not self._is_null_or_empty(value): info.update({key: value}) for alt_base in row[bigquery_util.ColumnKeyConstants.ALTERNATE_BASES]: - for key, value in alt_base.iteritems(): + for key, value in alt_base.items(): if (key != bigquery_util.ColumnKeyConstants.ALTERNATE_BASES_ALT and not self._is_null_or_empty(value)): if key not in info: @@ -109,7 +107,7 @@ def _get_variant_calls(self, variant_call_records): variant_calls = [] for call_record in variant_call_records: info = {} - for key, value in call_record.iteritems(): + for key, value in call_record.items(): if (key not in RESERVED_VARIANT_CALL_COLUMNS and not self._is_null_or_empty(value)): info.update({key: value}) diff --git a/gcp_variant_transforms/libs/bigquery_vcf_data_converter_test.py b/gcp_variant_transforms/libs/bigquery_vcf_data_converter_test.py index 7e8253931..64e7e71dc 100644 --- a/gcp_variant_transforms/libs/bigquery_vcf_data_converter_test.py +++ b/gcp_variant_transforms/libs/bigquery_vcf_data_converter_test.py @@ -14,7 +14,6 @@ """Tests for `bigquery_vcf_data_converter` module.""" -from __future__ import absolute_import import unittest from typing import Dict # pylint: disable=unused-import @@ -146,35 +145,35 @@ def _get_table_schema(): def _get_big_query_row(): # type: (None) -> Dict[unicode, Any] """Returns one sample BigQuery row for testing.""" - return {unicode(ColumnKeyConstants.REFERENCE_NAME): unicode('chr19'), - unicode(ColumnKeyConstants.START_POSITION): 11, - unicode(ColumnKeyConstants.END_POSITION): 12, - unicode(ColumnKeyConstants.REFERENCE_BASES): 'C', - unicode(ColumnKeyConstants.NAMES): [unicode('rs1'), unicode('rs2')], - unicode(ColumnKeyConstants.QUALITY): 2, - unicode(ColumnKeyConstants.FILTER): [unicode('PASS')], - unicode(ColumnKeyConstants.CALLS): [ - {unicode(ColumnKeyConstants.CALLS_SAMPLE_ID): ( + return {str(ColumnKeyConstants.REFERENCE_NAME): str('chr19'), + str(ColumnKeyConstants.START_POSITION): 11, + str(ColumnKeyConstants.END_POSITION): 12, + str(ColumnKeyConstants.REFERENCE_BASES): 'C', + 
str(ColumnKeyConstants.NAMES): [str('rs1'), str('rs2')], + str(ColumnKeyConstants.QUALITY): 2, + str(ColumnKeyConstants.FILTER): [str('PASS')], + str(ColumnKeyConstants.CALLS): [ + {str(ColumnKeyConstants.CALLS_SAMPLE_ID): ( hash_name('Sample1')), - unicode(ColumnKeyConstants.CALLS_GENOTYPE): [0, 1], - unicode(ColumnKeyConstants.CALLS_PHASESET): unicode('*'), - unicode('GQ'): 20, unicode('FIR'): [10, 20]}, - {unicode(ColumnKeyConstants.CALLS_SAMPLE_ID): ( + str(ColumnKeyConstants.CALLS_GENOTYPE): [0, 1], + str(ColumnKeyConstants.CALLS_PHASESET): str('*'), + str('GQ'): 20, str('FIR'): [10, 20]}, + {str(ColumnKeyConstants.CALLS_SAMPLE_ID): ( hash_name('Sample2')), - unicode(ColumnKeyConstants.CALLS_GENOTYPE): [1, 0], - unicode(ColumnKeyConstants.CALLS_PHASESET): None, - unicode('GQ'): 10, unicode('FB'): True} + str(ColumnKeyConstants.CALLS_GENOTYPE): [1, 0], + str(ColumnKeyConstants.CALLS_PHASESET): None, + str('GQ'): 10, str('FB'): True} ], - unicode(ColumnKeyConstants.ALTERNATE_BASES): [ - {unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): unicode('A'), - unicode('IFR'): 1, - unicode('IFR2'): 0.2}, - {unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): unicode('TT'), - unicode('IFR'): 0.2, - unicode('IFR2'): 0.3} + str(ColumnKeyConstants.ALTERNATE_BASES): [ + {str(ColumnKeyConstants.ALTERNATE_BASES_ALT): str('A'), + str('IFR'): 1, + str('IFR2'): 0.2}, + {str(ColumnKeyConstants.ALTERNATE_BASES_ALT): str('TT'), + str('IFR'): 0.2, + str('IFR2'): 0.3} ], - unicode('IS'): unicode('some data'), - unicode('ISR'): [unicode('data1'), unicode('data2')]} + str('IS'): str('some data'), + str('ISR'): [str('data1'), str('data2')]} class _DummyVariantMergeStrategy(variant_merge_strategy.VariantMergeStrategy): @@ -218,35 +217,35 @@ def test_get_variant_info_annotation(self): 'CSQ': ['allele', 'Consequence', 'AF', 'IMPACT'] }) row = { - unicode(ColumnKeyConstants.ALTERNATE_BASES): [ + str(ColumnKeyConstants.ALTERNATE_BASES): [ { - unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): u'G', - unicode('CSQ'): [ - {u'allele': 'G', - u'Consequence': u'upstream_gene_variant', - u'AF': u'', - u'IMPACT': u'MODIFIER'}, - {u'allele': 'G', - u'Consequence': u'upstream_gene_variant', - u'AF': u'0.1', - u'IMPACT': u''}] + str(ColumnKeyConstants.ALTERNATE_BASES_ALT): 'G', + str('CSQ'): [ + {'allele': 'G', + 'Consequence': 'upstream_gene_variant', + 'AF': '', + 'IMPACT': 'MODIFIER'}, + {'allele': 'G', + 'Consequence': 'upstream_gene_variant', + 'AF': '0.1', + 'IMPACT': ''}] }, { - unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): u'T', - unicode('CSQ'): [ - {u'allele': 'T', - u'Consequence': u'', - u'AF': u'', - u'IMPACT': u'MODIFIER'}, - {u'allele': 'T', - u'Consequence': u'upstream_gene_variant', - u'AF': u'0.6', - u'IMPACT': u''}] + str(ColumnKeyConstants.ALTERNATE_BASES_ALT): 'T', + str('CSQ'): [ + {'allele': 'T', + 'Consequence': '', + 'AF': '', + 'IMPACT': 'MODIFIER'}, + {'allele': 'T', + 'Consequence': 'upstream_gene_variant', + 'AF': '0.6', + 'IMPACT': ''}] }, { - unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): u'TT', - unicode('CSQ'): [] + str(ColumnKeyConstants.ALTERNATE_BASES_ALT): 'TT', + str('CSQ'): [] } ] } diff --git a/gcp_variant_transforms/libs/genomic_region_parser.py b/gcp_variant_transforms/libs/genomic_region_parser.py index 4b06ca025..38544f56e 100644 --- a/gcp_variant_transforms/libs/genomic_region_parser.py +++ b/gcp_variant_transforms/libs/genomic_region_parser.py @@ -14,7 +14,6 @@ """Function for parsing genomic region.""" -from __future__ import absolute_import import re import sys @@ -31,8 +30,8 @@ 
def parse_comma_sep_int(int_str): # type: (str) -> int try: int_value = int(int_str.replace(',', '')) - except: - raise ValueError('Given value is not integer: {}'.format(int_str)) + except Exception as e: + raise ValueError('Given value is not integer: {}'.format(int_str)) from e return int_value def parse_genomic_region(genomic_region): diff --git a/gcp_variant_transforms/libs/genomic_region_parser_test.py b/gcp_variant_transforms/libs/genomic_region_parser_test.py index 8f1c2686e..90c02189a 100644 --- a/gcp_variant_transforms/libs/genomic_region_parser_test.py +++ b/gcp_variant_transforms/libs/genomic_region_parser_test.py @@ -14,7 +14,6 @@ """Unit tests for `genomic_region_parser` module.""" -from __future__ import absolute_import import unittest diff --git a/gcp_variant_transforms/libs/hashing_util.py b/gcp_variant_transforms/libs/hashing_util.py index ea67ebf0b..5d2ef528c 100644 --- a/gcp_variant_transforms/libs/hashing_util.py +++ b/gcp_variant_transforms/libs/hashing_util.py @@ -22,7 +22,7 @@ import farmhash -def _generate_unsigned_hash_code(strings, max_hash_value=sys.maxint): +def _generate_unsigned_hash_code(strings, max_hash_value=sys.maxsize): # type: (List[str], int) -> int """Generates a forever-fixed hash code for `strings`. diff --git a/gcp_variant_transforms/libs/header_merger.py b/gcp_variant_transforms/libs/header_merger.py index db3e58989..64e88f669 100644 --- a/gcp_variant_transforms/libs/header_merger.py +++ b/gcp_variant_transforms/libs/header_merger.py @@ -20,7 +20,7 @@ from gcp_variant_transforms.beam_io import vcf_header_io from gcp_variant_transforms.libs import vcf_field_conflict_resolver #pylint: disable=unused-import -class HeaderMerger(object): +class HeaderMerger(): """Class for merging two :class:`VcfHeader`s.""" def __init__(self, resolver): @@ -70,16 +70,16 @@ def _merge_header_fields( ValueError: If the header fields are incompatible (e.g. same key with different types or numbers). """ - for second_key, second_value in second.iteritems(): + for second_key, second_value in second.items(): if second_key not in first: first[second_key] = second_value continue first_value = first[second_key] - if first_value.keys() != second_value.keys(): + if list(first_value.keys()) != list(second_value.keys()): raise ValueError('Incompatible header fields: {}, {}'.format( first_value, second_value)) merged_value = OrderedDict() - for first_field_key, first_field_value in first_value.iteritems(): + for first_field_key, first_field_value in first_value.items(): second_field_value = second_value[first_field_key] try: resolution_field_value = self._resolver.resolve_attribute_conflict( @@ -90,6 +90,6 @@ def _merge_header_fields( except ValueError as e: raise ValueError('Incompatible number or types in header fields:' '{}, {} \n. 
Error: {}'.format( - first_value, second_value, str(e))) + first_value, second_value, str(e))) from e first[second_key] = merged_value diff --git a/gcp_variant_transforms/libs/header_merger_test.py b/gcp_variant_transforms/libs/header_merger_test.py index e34224426..f0dad7dd0 100644 --- a/gcp_variant_transforms/libs/header_merger_test.py +++ b/gcp_variant_transforms/libs/header_merger_test.py @@ -64,10 +64,10 @@ def test_merge_header_with_empty_one(self): merger.merge(header_1, header_2) merger.merge(header_2, header_1) - self.assertItemsEqual(header_1.infos.keys(), ['NS', 'AF']) - self.assertItemsEqual(header_1.formats.keys(), ['GT', 'GQ']) - self.assertItemsEqual(header_2.infos.keys(), ['NS', 'AF']) - self.assertItemsEqual(header_2.formats.keys(), ['GT', 'GQ']) + self.assertCountEqual(list(header_1.infos.keys()), ['NS', 'AF']) + self.assertCountEqual(list(header_1.formats.keys()), ['GT', 'GQ']) + self.assertCountEqual(list(header_2.infos.keys()), ['NS', 'AF']) + self.assertCountEqual(list(header_2.formats.keys()), ['GT', 'GQ']) def test_merge_two_headers(self): main_header = self._get_header_from_lines(FILE_1_LINES) @@ -76,8 +76,8 @@ def test_merge_two_headers(self): merger = self._get_header_merger() merger.merge(main_header, secondary_header) - self.assertItemsEqual(main_header.infos.keys(), ['NS', 'AF', 'NS2']) - self.assertItemsEqual(main_header.formats.keys(), ['GT', 'GQ', 'GQ2']) + self.assertCountEqual(list(main_header.infos.keys()), ['NS', 'AF', 'NS2']) + self.assertCountEqual(list(main_header.formats.keys()), ['GT', 'GQ', 'GQ2']) def test_merge_two_type_conflicting_but_resolvable_headers(self): # These two headers have type conflict (Integer vs Float), however pipeline @@ -98,8 +98,8 @@ def test_merge_two_type_conflicting_but_resolvable_headers(self): merger.merge(main_header, secondary_header) - self.assertItemsEqual(main_header.infos.keys(), ['NS']) - self.assertItemsEqual(main_header.infos['NS'], + self.assertCountEqual(list(main_header.infos.keys()), ['NS']) + self.assertCountEqual(main_header.infos['NS'], OrderedDict([('id', 'NS'), ('num', 1), ('type', 'Float'), @@ -126,8 +126,8 @@ def test_merge_two_num_conflicting_but_resolvable_headers_1(self): merger.merge(main_header, secondary_header) - self.assertItemsEqual(main_header.infos.keys(), ['NS']) - self.assertItemsEqual(main_header.infos['NS'], + self.assertCountEqual(list(main_header.infos.keys()), ['NS']) + self.assertCountEqual(main_header.infos['NS'], OrderedDict([('id', 'NS'), ('num', '.'), ('type', 'Integer'), @@ -154,8 +154,8 @@ def test_merge_two_num_conflicting_but_resolvable_headers_2(self): merger.merge(main_header, secondary_header) - self.assertItemsEqual(main_header.infos.keys(), ['NS']) - self.assertItemsEqual(main_header.infos['NS'], + self.assertCountEqual(list(main_header.infos.keys()), ['NS']) + self.assertCountEqual(main_header.infos['NS'], OrderedDict([('id', 'NS'), ('num', '.'), ('type', 'Integer'), diff --git a/gcp_variant_transforms/libs/infer_headers_util.py b/gcp_variant_transforms/libs/infer_headers_util.py index fd3d4164f..26cf5c787 100644 --- a/gcp_variant_transforms/libs/infer_headers_util.py +++ b/gcp_variant_transforms/libs/infer_headers_util.py @@ -14,7 +14,7 @@ """A helper module for header inference operations.""" -from __future__ import absolute_import + import logging from typing import Any, Dict, List, Optional, Union # pylint: disable=unused-import @@ -118,7 +118,7 @@ def infer_format_fields( ) updated_formats = {} for call in variant.calls: - for format_key, format_value in 
call.info.iteritems(): + for format_key, format_value in call.info.items(): if format_key not in formats: logging.warning('Undefined FORMAT field "%s" in variant "%s"', format_key, str(variant)) @@ -342,7 +342,7 @@ def _infer_non_annotation_info_fields( the field values. defined_headers: header fields defined in header section of VCF files. """ - for info_field_key, info_field_value in variant.info.iteritems(): + for info_field_key, info_field_value in variant.info.items(): if not defined_headers or info_field_key not in defined_headers.infos: if info_field_key in infos: raise ValueError( @@ -413,7 +413,7 @@ def _check_annotation_lists_lengths(names, values): annotation_values = [annotation_parser.extract_annotation_list_with_alt( annotation)[1:] for annotation in variant.info[field]] _check_annotation_lists_lengths(annotation_names, annotation_values) - annotation_values = zip(*annotation_values) + annotation_values = list(zip(*annotation_values)) for name, values in zip(annotation_names, annotation_values): variant_merged_type = '.' for v in values: diff --git a/gcp_variant_transforms/libs/infer_headers_util_test.py b/gcp_variant_transforms/libs/infer_headers_util_test.py index ff635e177..d51f1cbc7 100644 --- a/gcp_variant_transforms/libs/infer_headers_util_test.py +++ b/gcp_variant_transforms/libs/infer_headers_util_test.py @@ -14,7 +14,6 @@ """Tests for infer_headers_util module.""" -from __future__ import absolute_import from collections import OrderedDict import unittest diff --git a/gcp_variant_transforms/libs/metrics_util.py b/gcp_variant_transforms/libs/metrics_util.py index 57d1b61de..eb913d0fc 100644 --- a/gcp_variant_transforms/libs/metrics_util.py +++ b/gcp_variant_transforms/libs/metrics_util.py @@ -29,7 +29,6 @@ """ -from __future__ import absolute_import import logging @@ -45,7 +44,7 @@ _COUNTERS = 'counters' -class CounterInterface(object): +class CounterInterface(): """The interface of counter objects""" def inc(self, n=1): @@ -76,7 +75,7 @@ def inc(self, n=1): self._counter.inc(n) -class CounterFactoryInterface(object): +class CounterFactoryInterface(): """The interface for counter factories.""" def create_counter(self, counter_name): diff --git a/gcp_variant_transforms/libs/metrics_util_test.py b/gcp_variant_transforms/libs/metrics_util_test.py index 07700f3d1..2095986c3 100644 --- a/gcp_variant_transforms/libs/metrics_util_test.py +++ b/gcp_variant_transforms/libs/metrics_util_test.py @@ -14,7 +14,6 @@ """Unit tests for metrics_util module.""" -from __future__ import absolute_import import unittest diff --git a/gcp_variant_transforms/libs/partitioning.py b/gcp_variant_transforms/libs/partitioning.py index 71f50d633..ca81a5cf8 100644 --- a/gcp_variant_transforms/libs/partitioning.py +++ b/gcp_variant_transforms/libs/partitioning.py @@ -14,7 +14,6 @@ """Utilities to create integer range partitioned BigQuery tables.""" -from concurrent.futures import TimeoutError import json import logging import math @@ -56,7 +55,7 @@ '--clustering_fields=start_position,end_position ' '{FULL_TABLE_ID} {SCHEMA_FILE_PATH}') -class FlattenCallColumn(object): +class FlattenCallColumn(): """Flattens call column to convert variant opt tables to sample opt tables.""" def __init__(self, base_table_id, suffixes, append): @@ -115,7 +114,7 @@ def _run_query(self, query): break result = [] for i in iterator: - result.append(str(i.values()[0])) + result.append(str(list(i.values())[0])) return result def _get_column_names(self): diff --git a/gcp_variant_transforms/libs/preprocess_reporter.py 
b/gcp_variant_transforms/libs/preprocess_reporter.py index 7870647d4..a6ba1e5f3 100644 --- a/gcp_variant_transforms/libs/preprocess_reporter.py +++ b/gcp_variant_transforms/libs/preprocess_reporter.py @@ -61,14 +61,14 @@ _DELIMITER = '\t' -class _InconsistencyType(object): +class _InconsistencyType(): - """Inconsistency types that included in the report.""" + """Inconsistency types that are included in the report.""" HEADER_CONFLICTS = 'Header Conflicts' INFERRED_HEADERS = 'Inferred Headers' MALFORMED_RECORDS = 'Malformed Records' -class _HeaderLine(object): +class _HeaderLine(): """Header lines for each error type.""" CONFLICTS_HEADER = 'ID\tCategory\tConflicts\tFile Paths\tProposed Resolution' INFERRED_FIELD_HEADER = 'ID\tCategory\tProposed Resolution' @@ -115,7 +115,7 @@ def _extract_conflicts( which maps `Definition` to a list of file names. """ # len(v) > 1 means there are conflicting definitions for this field. - return dict([(k, v) for k, v in definitions.items() if len(v) > 1]) + return {k: v for k, v in definitions.items() if len(v) > 1} def _append_conflicting_headers_to_report( @@ -206,9 +206,13 @@ def _generate_conflicting_headers_lines( num=1 type=Integer file2 """ content_lines = [] + # The first element of a conflict definition may be a string or an integer, + # so it cannot be sorted directly in Python 3. Convert all nums to strings + # and also sort by the secondary field (type), for deterministic results. for field_id in sorted(conflicts.keys()): first_item = True - for definition in sorted(conflicts.get(field_id).keys()): + for definition in sorted(conflicts.get(field_id).keys(), + key=lambda x: (str(x[0]), x[1])): sorted_file_names = sorted(conflicts.get(field_id).get(definition)) if first_item: row = [field_id, @@ -284,10 +288,10 @@ def _append_to_report(file_to_write, error_type, header, contents): `file_to_write` sequentially. """ if not contents: - file_to_write.write('No ' + error_type + ' Found.\n') + file_to_write.write(('No ' + error_type + ' Found.\n').encode('utf-8')) else: - file_to_write.write(error_type + '\n') - file_to_write.write(header + '\n') + file_to_write.write((error_type + '\n').encode('utf-8')) + file_to_write.write((header + '\n').encode('utf-8')) for content in contents: - file_to_write.write(content + '\n') - file_to_write.write('\n') + file_to_write.write((content + '\n').encode('utf-8')) + file_to_write.write(b'\n') diff --git a/gcp_variant_transforms/libs/preprocess_reporter_test.py b/gcp_variant_transforms/libs/preprocess_reporter_test.py index 069ddb5c1..30cede877 100644 --- a/gcp_variant_transforms/libs/preprocess_reporter_test.py +++ b/gcp_variant_transforms/libs/preprocess_reporter_test.py @@ -51,7 +51,7 @@ def _generate_report_and_assert_contents_equal( inferred_headers, malformed_records) with FileSystems.open(file_path) as f: - reader = f.readlines() + reader = [b.decode('utf-8') for b in f.readlines()] self.assertEqual(reader, expected_content) def test_report_no_conflicts(self):
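The `_append_to_report` and reporter-test hunks above are the textbook Python 3 bytes-versus-text fix: Beam's `FileSystems` hands back binary file objects, so text must be encoded on write and decoded on read. A standalone sketch of the same discipline with plain `open` (the file name is invented):

```python
# Standalone sketch (invented file name) of the bytes/str handling the
# preprocess_reporter hunks above apply: binary handles accept only bytes.
report_lines = ['Header Conflicts', 'ID\tCategory\tProposed Resolution']

with open('report.tsv', 'wb') as f:   # binary mode, like FileSystems.create
    for line in report_lines:
        f.write((line + '\n').encode('utf-8'))

with open('report.tsv', 'rb') as f:   # binary mode, like FileSystems.open
    recovered = [b.decode('utf-8') for b in f.readlines()]

assert recovered == [line + '\n' for line in report_lines]
```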
diff --git a/gcp_variant_transforms/libs/processed_variant.py b/gcp_variant_transforms/libs/processed_variant.py index 1a8f668ef..5e7fceed0 100644 --- a/gcp_variant_transforms/libs/processed_variant.py +++ b/gcp_variant_transforms/libs/processed_variant.py @@ -20,7 +20,6 @@ functions are "private". """ -from __future__ import absolute_import import enum import logging @@ -61,7 +60,7 @@ class _CounterEnum(enum.Enum): ALLELE_NUM_INCORRECT = 'allele_num_incorrect' -class ProcessedVariant(object): +class ProcessedVariant(): """A wrapper around the ``Variant`` class with extra functionality. Given header file information, this can parse INFO fields that need to be @@ -146,7 +145,7 @@ def alternate_data_list(self): return self._alternate_datas -class AlternateBaseData(object): +class AlternateBaseData(): """This is to keep all information for a single alternate-bases.""" def __init__(self, alt_bases): @@ -185,7 +184,7 @@ def annotation_field_names(self): return self._annotation_field_names -class ProcessedVariantFactory(object): +class ProcessedVariantFactory(): """Factory class for creating `ProcessedVariant` instances. This is the only right way for creating ProcessedVariants in production code. @@ -255,7 +254,7 @@ def create_processed_variant(self, variant): """ proc_var = ProcessedVariant(variant) self._variant_counter.inc() - for key, variant_info_data in variant.info.iteritems(): + for key, variant_info_data in variant.info.items(): if key in self._annotation_field_set: self._annotation_processor.add_annotation_data( proc_var, key, variant_info_data) @@ -302,7 +301,7 @@ def create_alt_bases_field_schema(self): mode=bigquery_util.TableFieldConstants.MODE_NULLABLE, description='Alternate base.')) if self._split_alternate_allele_info_fields: - for key, field in self._header_fields.infos.iteritems(): + for key, field in self._header_fields.infos.items(): if self._is_num_a(field[_HeaderKeyConstants.NUM]): alternate_bases_record.fields.append(bigquery.TableFieldSchema( name=_BigQuerySchemaSanitizer.get_sanitized_field_name(key), @@ -392,7 +391,7 @@ def _is_num_a(self, field_value): return field_value == _FIELD_COUNT_ALTERNATE_ALLELE -class _AnnotationProcessor(object): +class _AnnotationProcessor(): """This is for handling all annotation related logic for variants.""" @@ -475,7 +474,7 @@ def add_annotation_data(self, proc_var, annotation_field_name, data): for annotation_str in data: try: ind, annotation_map = parser.parse_and_match_alt(annotation_str) - for name, value in annotation_map.iteritems(): + for name, value in annotation_map.items(): if name == annotation_parser.ANNOTATION_ALT: continue type_key = infer_headers_util.get_inferred_annotation_type_header_key( diff --git a/gcp_variant_transforms/libs/processed_variant_test.py b/gcp_variant_transforms/libs/processed_variant_test.py index 29c2fb35d..e73bfd23f 100644 --- a/gcp_variant_transforms/libs/processed_variant_test.py +++ b/gcp_variant_transforms/libs/processed_variant_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License.
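The `iteritems()` replacements above, together with the `has_key()` removals in the test hunks that follow, are the two dict-API changes this migration repeats most often; `keys()` also now returns a lazy view rather than a list. A small sketch with a toy dict (nothing here is from the codebase):

```python
# Toy dict illustrating the Python 3 replacements for removed py2 dict
# methods, as applied throughout this diff.
info = {'A1': 'data1', 'A2': ['d1', 'd2']}

for key, value in info.items():  # py2: info.iteritems()
    print(key, value)

assert 'A1' in info              # py2: info.has_key('A1')

# keys() is a view in Python 3; materialize it when the dict is mutated
# during iteration or when list semantics (indexing, equality) are needed.
for key in list(info.keys()):
    info.pop(key)
```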
-from __future__ import absolute_import + from typing import Dict # pylint: disable=unused-import from collections import OrderedDict @@ -107,7 +107,7 @@ def test_create_processed_variant_move_alt_info(self): alt2 = processed_variant.AlternateBaseData('TT') alt2._info = {'A2': 'data2'} self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) - self.assertFalse(proc_var.non_alt_info.has_key('A2')) + self.assertFalse('A2' in proc_var.non_alt_info) def test_create_processed_variant_move_alt_info_extra_values(self): header_fields = vcf_header_util.make_header({'A1': '1', 'A2': 'A'}) @@ -133,7 +133,7 @@ def test_create_processed_variant_move_alt_info_extra_values(self): alt2 = processed_variant.AlternateBaseData('TT') alt2._info = {'A2': 'data2'} self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) - self.assertFalse(proc_var.non_alt_info.has_key('A2')) + self.assertFalse('A2' in proc_var.non_alt_info) def test_create_processed_variant_move_alt_info_insufficient_values(self): header_fields = vcf_header_util.make_header({'A1': '1', 'A2': 'A'}) @@ -158,7 +158,7 @@ def test_create_processed_variant_move_alt_info_insufficient_values(self): alt1._info = {'A2': 'data1'} alt2 = processed_variant.AlternateBaseData('TT') self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) - self.assertFalse(proc_var.non_alt_info.has_key('A2')) + self.assertFalse('A2' in proc_var.non_alt_info) def _get_sample_variant_and_header_with_csq(self, additional_infos=None): """Provides a simple `Variant` and `VcfHeader` with info fields @@ -211,8 +211,8 @@ def test_create_processed_variant_move_alt_info_and_annotation(self): 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) - self.assertFalse(proc_var.non_alt_info.has_key('A2')) - self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) + self.assertFalse('A2' in proc_var.non_alt_info) + self.assertFalse('CSQ' in proc_var.non_alt_info) self.assertEqual(counter_factory.counter_map[ CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ @@ -252,8 +252,8 @@ def test_create_processed_variant_mismatched_annotation_alt(self): 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) - self.assertFalse(proc_var.non_alt_info.has_key('A2')) - self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) + self.assertFalse('A2' in proc_var.non_alt_info) + self.assertFalse('CSQ' in proc_var.non_alt_info) self.assertEqual(counter_factory.counter_map[ CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ @@ -299,7 +299,7 @@ def test_create_processed_variant_symbolic_and_breakend_annotation_alt(self): 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3]) - self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) + self.assertFalse('CSQ' in proc_var.non_alt_info) self.assertEqual(counter_factory.counter_map[ CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ @@ -343,7 +343,7 @@ def test_create_processed_variant_annotation_alt_prefix(self): 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3]) - self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) + self.assertFalse('CSQ' in proc_var.non_alt_info) self.assertEqual(counter_factory.counter_map[ CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ @@ -387,7 +387,7 @@ def 
test_create_processed_variant_annotation_alt_long_prefix(self): 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3]) - self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) + self.assertFalse('CSQ' in proc_var.non_alt_info) self.assertEqual(counter_factory.counter_map[ CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ @@ -425,7 +425,7 @@ def test_create_processed_variant_annotation_alt_prefix_but_ref(self): 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) - self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) + self.assertFalse('CSQ' in proc_var.non_alt_info) self.assertEqual(counter_factory.counter_map[ CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ @@ -466,7 +466,7 @@ def test_create_processed_variant_annotation_alt_minimal(self): 'Consequence': 'C2', 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3]) - self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) + self.assertFalse('CSQ' in proc_var.non_alt_info) self.assertEqual(counter_factory.counter_map[ CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ @@ -517,7 +517,7 @@ def test_create_processed_variant_annotation_alt_allele_num(self): 'Consequence': 'C2', 'IMPACT': 'I2', 'ALLELE_NUM': '2'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) - self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) + self.assertFalse('CSQ' in proc_var.non_alt_info) self.assertEqual(counter_factory.counter_map[ CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ diff --git a/gcp_variant_transforms/libs/schema_converter.py b/gcp_variant_transforms/libs/schema_converter.py index 8fbad8d82..578568d96 100644 --- a/gcp_variant_transforms/libs/schema_converter.py +++ b/gcp_variant_transforms/libs/schema_converter.py @@ -14,7 +14,6 @@ """Handles the conversion between BigQuery/Avro schema and VCF header.""" -from __future__ import absolute_import from collections import OrderedDict import json @@ -147,7 +146,7 @@ def generate_schema_from_header_fields( description=('Phaseset of the call (if any). "*" is used in cases where ' 'the genotype is phased, but no phase set ("PS" in FORMAT) ' 'was specified.'))) - for key, field in header_fields.formats.iteritems(): + for key, field in header_fields.formats.items(): # GT and PS are already included in 'genotype' and 'phaseset' fields. if key in (vcfio.GENOTYPE_FORMAT_KEY, vcfio.PHASESET_FORMAT_KEY): continue @@ -165,7 +164,7 @@ def generate_schema_from_header_fields( info_keys = set() annotation_info_type_keys_set = set( proc_variant_factory.gen_annotation_info_type_keys()) - for key, field in header_fields.infos.iteritems(): + for key, field in header_fields.infos.items(): # END info is already included by modifying the end_position. Info type # fields exist only to indicate the type of corresponding annotation fields, # and should not be added to the schema. @@ -237,7 +236,7 @@ def _convert_field_to_avro_dict(field): return field_dict -def _convert_schema_to_avro_dict(schema): +def convert_schema_to_avro_dict(schema): # type: (bigquery.TableSchema) -> Dict fields_dict = {} # TODO(bashir2): Check if we need `namespace` and `name` at the top level. 
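`_convert_schema_to_avro_dict` is made public here as `convert_schema_to_avro_dict`, and the hunk below updates its internal call site accordingly; the function builds a plain dict that `json.dumps` then serializes into the Avro schema string. For orientation, a hand-written example of a minimal Avro record schema in that dict form; the field names are illustrative, not the module's actual output:

```python
import json

# Illustrative only: a minimal Avro record schema as a plain dict. Unions
# with 'null' make fields optional, the usual Avro counterpart of
# BigQuery's NULLABLE mode.
schema_dict = {
    'type': 'record',
    'name': 'variants',
    'fields': [
        {'name': 'reference_name', 'type': ['null', 'string']},
        {'name': 'start_position', 'type': ['null', 'long']},
        {'name': 'names', 'type': {'type': 'array', 'items': 'string'}},
    ],
}
print(json.dumps(schema_dict))
```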
@@ -319,7 +318,7 @@ def convert_table_schema_to_json_avro_schema(schema): raise ValueError( 'Expected an instance of bigquery.TableSchema got {}'.format( type(schema))) - schema_dict = _convert_schema_to_avro_dict(schema) + schema_dict = convert_schema_to_avro_dict(schema) json_str = json.dumps(schema_dict) logging.info('The Avro schema is: %s', json_str) return json_str @@ -367,7 +366,7 @@ def generate_header_fields_from_schema(schema, allow_incompatible_schema=False): if (field.type not in bigquery_util.get_supported_bigquery_schema_types() or field.name in _NON_INFO_OR_FORMAT_CONSTANT_FIELDS): continue - elif field.name == bigquery_util.ColumnKeyConstants.CALLS: + if field.name == bigquery_util.ColumnKeyConstants.CALLS: _add_format_fields(field, formats, allow_incompatible_schema) else: _add_info_fields(field, infos, allow_incompatible_schema) @@ -380,8 +379,8 @@ def _add_format_fields(schema, formats, allow_incompatible_schema=False): for field in schema.fields: if field.name in _CONSTANT_CALL_FIELDS: continue - elif (field.name in vcf_reserved_fields.FORMAT_FIELDS.keys() and - not allow_incompatible_schema): + if (field.name in list(vcf_reserved_fields.FORMAT_FIELDS.keys()) and + not allow_incompatible_schema): reserved_definition = vcf_reserved_fields.FORMAT_FIELDS.get(field.name) _validate_reserved_field(field, reserved_definition) formats.update({field.name: vcf_header_io.CreateFormatField( @@ -405,7 +404,7 @@ def _add_info_fields(field, infos, allow_incompatible_schema=False): _add_info_fields_from_alternate_bases(field, infos, allow_incompatible_schema) - elif (field.name in vcf_reserved_fields.INFO_FIELDS.keys() and + elif (field.name in list(vcf_reserved_fields.INFO_FIELDS.keys()) and not allow_incompatible_schema): reserved_definition = vcf_reserved_fields.INFO_FIELDS.get(field.name) _validate_reserved_field(field, reserved_definition) @@ -440,13 +439,13 @@ def _add_info_fields_from_alternate_bases(schema, for field in schema.fields: if field.name in _CONSTANT_ALTERNATE_BASES_FIELDS: continue - elif field.type == bigquery_util.TableFieldConstants.TYPE_RECORD: + if field.type == bigquery_util.TableFieldConstants.TYPE_RECORD: infos.update({field.name: vcf_header_io.CreateInfoField( field.name, vcfio.MISSING_FIELD_VALUE, bigquery_util._VcfHeaderTypeConstants.STRING, _remove_special_characters(_get_annotation_description(field)))}) - elif (field.name in vcf_reserved_fields.INFO_FIELDS.keys() and + elif (field.name in list(vcf_reserved_fields.INFO_FIELDS.keys()) and not allow_incompatible_schema): reserved_definition = vcf_reserved_fields.INFO_FIELDS.get(field.name) _validate_reserved_field_type(field, reserved_definition) diff --git a/gcp_variant_transforms/libs/schema_converter_test.py b/gcp_variant_transforms/libs/schema_converter_test.py index 9bb9c6a11..6fdbc9a1f 100644 --- a/gcp_variant_transforms/libs/schema_converter_test.py +++ b/gcp_variant_transforms/libs/schema_converter_test.py @@ -14,7 +14,6 @@ """Tests for `schema_converter` module.""" -from __future__ import absolute_import from collections import OrderedDict import json @@ -218,7 +217,7 @@ class ConvertTableSchemaToJsonAvroSchemaTest( """ def _validate_schema(self, expected_fields, actual_schema): - super(ConvertTableSchemaToJsonAvroSchemaTest, self)._validate_schema( + super()._validate_schema( expected_fields, actual_schema) avro_schema = avro.schema.parse( schema_converter.convert_table_schema_to_json_avro_schema( @@ -237,8 +236,7 @@ class ConvertTableSchemaToJsonBQSchemaTest( """ def _validate_schema(self, 
expected_fields, actual_schema): - super(ConvertTableSchemaToJsonBQSchemaTest, self)._validate_schema( - expected_fields, actual_schema) + super()._validate_schema(expected_fields, actual_schema) json_schema = schema_converter.convert_table_schema_to_json_bq_schema( actual_schema) # Beam expects schema to be generated from dict with 'fields' item being @@ -655,7 +653,7 @@ def _get_fields_from_avro_type(field_or_schema, prefix): name = field_or_schema.name if name and name not in fields and name != 'TBD': fields.extend([prefix + field_or_schema.name]) - if field_or_schema.get_prop('fields'): + if 'fields' in field_or_schema.props: child_prefix = prefix if name != 'TBD': child_prefix = prefix + field_or_schema.name + '.' diff --git a/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy.py b/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy.py index 196d5c73e..ae0168520 100644 --- a/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy.py +++ b/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy.py @@ -14,7 +14,6 @@ -"""Variant merge stategy that can handle both Variants and Non-variants.""" +"""Variant merge strategy that can handle both Variants and Non-variants.""" -from __future__ import absolute_import import collections import copy @@ -92,8 +91,12 @@ def get_merged_variants(self, variants, key=None): if self._is_non_variant(v): non_variant_tree.addi(v.start, v.end, v) else: - group_key = next(self._move_to_calls.get_merge_keys(v)) - grouped_variants[group_key].append(v) + try: + group_key = next(self._move_to_calls.get_merge_keys(v)) + except StopIteration: + continue + else: + grouped_variants[group_key].append(v) non_variants = self._merge_non_variants(non_variant_tree) variants = self._merge_variants(grouped_variants) @@ -106,17 +109,21 @@ def get_merged_variants(self, variants, key=None): for v in variants: non_variant_interval = non_variant_tree.search(v.start, v.end) if non_variant_interval: - non_variant = next(iter(non_variant_interval)).data - v.calls.extend(non_variant.calls) - v.calls = sorted(v.calls) - self._update_splits(splits, v) + try: + non_variant = next(iter(non_variant_interval)).data + except StopIteration: + continue + else: + v.calls.extend(non_variant.calls) + v.calls = sorted(v.calls) + self._update_splits(splits, v) yield v for non_variant in self._split_non_variants(non_variant_tree, splits): yield non_variant def _merge_non_variants(self, t): - bounds = sorted(set([p for v in t for p in [v.begin, v.end]])) + bounds = sorted({p for v in t for p in [v.begin, v.end]}) intervals = [(bounds[i], bounds[i+1]) for i, _ in enumerate(bounds[:-1])] merged_non_variants = [] for start, end in intervals: @@ -133,8 +140,12 @@ def _merge_non_variants(self, t): for non_variant in overlapping_variants[1:]: merged_non_variant.names.extend(non_variant.names) merged_non_variant.filters.extend(non_variant.filters) - merged_non_variant.quality = min(merged_non_variant.quality, - non_variant.quality) + if (merged_non_variant.quality is not None and + non_variant.quality is not None): + merged_non_variant.quality = min(merged_non_variant.quality, + non_variant.quality) + elif merged_non_variant.quality is None: + merged_non_variant.quality = non_variant.quality merged_non_variant.calls.extend(non_variant.calls) merged_non_variant.names = sorted(set(merged_non_variant.names))
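Two of the hunks above are behavioral fixes rather than renames. First, under PEP 479 (the default since Python 3.7) a `StopIteration` escaping a generator is turned into a `RuntimeError`, hence the new try/except around the `next(...)` calls. Second, Python 3 forbids ordering comparisons with `None`, so `min(None, q)` now raises `TypeError` where Python 2 quietly returned `None`; that forces the explicit guards around the quality merge. A minimal sketch of that guard as a hypothetical standalone helper:

```python
# Hypothetical helper (not the class method itself) mirroring the
# None-safe quality merge added above: Python 2 ordered None before
# everything, so min(None, 20) was None; Python 3 raises TypeError.
def merge_quality(a, b):
    if a is not None and b is not None:
        return min(a, b)
    return b if a is None else a

assert merge_quality(10, 20) == 10
assert merge_quality(None, 20) == 20   # py2's min() would have kept None
assert merge_quality(10, None) == 10
assert merge_quality(None, None) is None
```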
@@ -147,7 +158,7 @@ def _merge_variants(self, grouped_variants): merged_variants = [] - for merge_key, variants in grouped_variants.iteritems(): + for merge_key, variants in grouped_variants.items(): merged_variants.extend( self._move_to_calls.get_merged_variants(variants, merge_key)) return merged_variants @@ -172,7 +183,7 @@ def _split_non_variants(self, non_variant_tree, splits): onv = onv_interval.data if split.begin <= onv.start and split.end >= onv.end: continue - elif split.begin <= onv.start: + if split.begin <= onv.start: nv = copy.deepcopy(onv) nv.start = split.end non_variant_tree.addi(nv.start, nv.end, nv) diff --git a/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy_test.py b/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy_test.py index 2ae41276c..fe4a9a2ce 100644 --- a/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy_test.py +++ b/gcp_variant_transforms/libs/variant_merge/merge_with_non_variants_strategy_test.py @@ -14,7 +14,6 @@ """Tests for merge_with_nonvariants_strategy.""" -from __future__ import absolute_import import copy import unittest @@ -84,7 +83,7 @@ def test_get_merged_variants_no_custom_options(self): vcfio.VariantCall(sample_id=hash_name('Sample4'), genotype=[1, 0], info={'GQ': 20})], merged_variant.calls) - self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys()) + self.assertCountEqual(['A1', 'A2', 'A3'], merged_variant.info.keys()) self.assertTrue( merged_variant.info['A1'] in ('some data', 'some data2')) self.assertEqual(['data1', 'data2'], merged_variant.info['A2']) @@ -130,7 +129,7 @@ def test_get_merged_variants_move_quality_and_filter_to_calls(self): ColumnKeyConstants.QUALITY: 20, ColumnKeyConstants.FILTER: ['q10']})], merged_variant.calls) - self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys()) + self.assertCountEqual(['A1', 'A2', 'A3'], merged_variant.info.keys()) self.assertTrue( merged_variant.info['A1'] in ('some data', 'some data2')) self.assertEqual(['data1', 'data2'], merged_variant.info['A2']) @@ -165,7 +164,7 @@ def test_get_merged_variants_move_info_to_calls(self): vcfio.VariantCall(sample_id=hash_name('Sample4'), genotype=[1, 0], info={'GQ': 20, 'A1': 'some data2'})], merged_variant.calls) - self.assertItemsEqual(['A2', 'A3'], merged_variant.info.keys()) + self.assertCountEqual(['A2', 'A3'], merged_variant.info.keys()) self.assertEqual(['data1', 'data2'], merged_variant.info['A2']) self.assertEqual(['data3', 'data4'], merged_variant.info['A3']) @@ -214,7 +213,7 @@ def test_get_merged_variants_move_everything_to_calls(self): ColumnKeyConstants.QUALITY: 20, ColumnKeyConstants.FILTER: ['q10']})], merged_variant.calls) - self.assertEqual([], merged_variant.info.keys()) + self.assertEqual([], list(merged_variant.info.keys())) def test_get_snp_merge_keys(self): strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy( diff --git a/gcp_variant_transforms/libs/variant_merge/move_to_calls_strategy.py b/gcp_variant_transforms/libs/variant_merge/move_to_calls_strategy.py index 8e980aea3..5cb271c10 100644 --- a/gcp_variant_transforms/libs/variant_merge/move_to_calls_strategy.py +++ b/gcp_variant_transforms/libs/variant_merge/move_to_calls_strategy.py @@ -14,7 +14,6 @@ -"""Implements a variant merge stategy that moves fields to calls.""" +"""Implements a variant merge strategy that moves fields to calls.""" -from __future__ import absolute_import import hashlib import re @@ -81,7 +80,7 @@ def move_data_to_calls(self, variant): if self._should_copy_quality_to_calls(): additional_call_info[ bigquery_util.ColumnKeyConstants.QUALITY] = variant.quality - for info_key, info_value in variant.info.iteritems(): + for info_key, info_value in variant.info.items(): if
self._should_move_info_key_to_calls(info_key): additional_call_info[info_key] = info_value for call in variant.calls: @@ -97,7 +96,7 @@ def move_data_to_merged(self, variant, merged_variant): merged_variant: The variant who will receive the info items of `variant` if specified. """ - for info_key, info_value in variant.info.iteritems(): + for info_key, info_value in variant.info.items(): if not self._should_move_info_key_to_calls(info_key): merged_variant.info[info_key] = info_value @@ -126,7 +125,11 @@ def get_merged_variants(self, variants, unused_key=None): merged_variant.names.extend(variant.names) merged_variant.filters.extend(variant.filters) - merged_variant.quality = max(merged_variant.quality, variant.quality) + if (merged_variant.quality is not None and + variant.quality is not None): + merged_variant.quality = max(merged_variant.quality, variant.quality) + elif merged_variant.quality is None: + merged_variant.quality = variant.quality self.move_data_to_calls(variant) self.move_data_to_merged(variant, merged_variant) @@ -158,7 +161,7 @@ def modify_bigquery_schema(self, schema, info_keys): if not calls_record: raise ValueError('calls record must exist in the schema.') - existing_calls_keys = set([field.name for field in calls_record.fields]) + existing_calls_keys = {field.name for field in calls_record.fields} updated_fields = [] for field in schema.fields: if (self._should_copy_filter_to_calls() and @@ -188,7 +191,7 @@ def modify_bigquery_schema(self, schema, info_keys): schema.fields = updated_fields def _get_hash(self, value): - return hashlib.md5(value).hexdigest() + return hashlib.md5(value.encode('utf-8')).hexdigest() def _should_move_info_key_to_calls(self, info_key): return bool(self._info_keys_to_move_to_calls_re and diff --git a/gcp_variant_transforms/libs/variant_merge/move_to_calls_strategy_test.py b/gcp_variant_transforms/libs/variant_merge/move_to_calls_strategy_test.py index cb286baf9..d6d925dc5 100644 --- a/gcp_variant_transforms/libs/variant_merge/move_to_calls_strategy_test.py +++ b/gcp_variant_transforms/libs/variant_merge/move_to_calls_strategy_test.py @@ -14,7 +14,6 @@ """Tests for move_to_calls_strategy.""" -from __future__ import absolute_import import unittest @@ -86,7 +85,7 @@ def test_get_merged_variants_no_custom_options(self): vcfio.VariantCall( sample_id=hash_name('Sample4'), genotype=[1, 0], info={'GQ': 20})], merged_variant.calls) - self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys()) + self.assertCountEqual(['A1', 'A2', 'A3'], merged_variant.info.keys()) self.assertTrue( merged_variant.info['A1'] in ('some data', 'some data2')) self.assertEqual(['data1', 'data2'], @@ -134,7 +133,7 @@ def test_get_merged_variants_move_quality_and_filter_to_calls(self): ColumnKeyConstants.QUALITY: 20, ColumnKeyConstants.FILTER: ['q10']})], merged_variant.calls) - self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys()) + self.assertCountEqual(['A1', 'A2', 'A3'], merged_variant.info.keys()) self.assertTrue( merged_variant.info['A1'] in ('some data', 'some data2')) self.assertEqual(['data1', 'data2'], @@ -171,7 +170,7 @@ def test_get_merged_variants_move_info_to_calls(self): vcfio.VariantCall(sample_id=hash_name('Sample4'), genotype=[1, 0], info={'GQ': 20, 'A1': 'some data2'})], merged_variant.calls) - self.assertItemsEqual(['A2', 'A3'], merged_variant.info.keys()) + self.assertCountEqual(['A2', 'A3'], merged_variant.info.keys()) self.assertEqual(['data1', 'data2'], merged_variant.info['A2']) self.assertEqual(['data3', 'data4'], @@ -222,7 
+221,7 @@ def test_get_merged_variants_move_everything_to_calls(self): ColumnKeyConstants.QUALITY: 20, ColumnKeyConstants.FILTER: ['q10']})], merged_variant.calls) - self.assertEqual([], merged_variant.info.keys()) + self.assertEqual([], list(merged_variant.info.keys())) def test_get_merge_keys(self): strategy = move_to_calls_strategy.MoveToCallsStrategy(None, None, None) diff --git a/gcp_variant_transforms/libs/variant_merge/variant_merge_strategy.py b/gcp_variant_transforms/libs/variant_merge/variant_merge_strategy.py index 431bc8d54..0002bbdcf 100644 --- a/gcp_variant_transforms/libs/variant_merge/variant_merge_strategy.py +++ b/gcp_variant_transforms/libs/variant_merge/variant_merge_strategy.py @@ -23,7 +23,7 @@ __all__ = ['VariantMergeStrategy'] -class VariantMergeStrategy(object): +class VariantMergeStrategy(): """Interface for a variant merge strategy.""" def get_merged_variants(self, variants, key): @@ -59,4 +59,3 @@ def modify_bigquery_schema(self, schema, info_keys): ValueError: If updates to the schema are incompatible with the settings specified by the merge strategy. """ - pass diff --git a/gcp_variant_transforms/libs/variant_sharding.py b/gcp_variant_transforms/libs/variant_sharding.py index 9581b8443..396c675e1 100644 --- a/gcp_variant_transforms/libs/variant_sharding.py +++ b/gcp_variant_transforms/libs/variant_sharding.py @@ -24,7 +24,6 @@ available at gcp_variant_transforms/testing/data/misc/*.yaml """ -from __future__ import absolute_import from collections import defaultdict import re @@ -52,7 +51,7 @@ _PARTITION_RANGE_END = 'partition_range_end' -class _ChromosomeSharder(object): +class _ChromosomeSharder(): """Assigns shard indices to multiple regions inside a chromosome. This class logic is implemented using an interval tree, each region is @@ -93,7 +92,7 @@ def get_shard_index(self, pos=0): else: return _UNDEFINED_SHARD_INDEX -class VariantSharding(object): +class VariantSharding(): """Sharding variants based on their reference_name [and position].""" def __init__(self, config_file_path=None): @@ -140,7 +139,7 @@ def _validate_config_and_check_intervals(self, config_file_path): try: shards = yaml.load(f) except yaml.YAMLError as e: - raise ValueError('Invalid yaml file: {}'.format(str(e))) + raise ValueError('Invalid yaml file: {}'.format(str(e))) from e if len(shards) > _MAX_NUM_SHARDS: raise ValueError( 'There can be at most {} output tables but given config file ' @@ -212,9 +211,10 @@ def _validate_config_and_check_intervals(self, config_file_path): try: partition_range_end = genomic_region_parser.parse_comma_sep_int( partition_range_end) - except: - raise ValueError('Wrong sharding config file, each output table ' - 'needs an integer for partition_range_end > 0.') + except Exception as e: + raise ValueError( + 'Wrong sharding config file, each output table ' + 'needs an integer for partition_range_end > 0.') from e if partition_range_end <= 0: raise ValueError('Wrong sharding config file, each output table ' 'needs an integer for partition_range_end > 0.') @@ -230,7 +230,7 @@ def _parse_config(self, config_file_path): try: shards = yaml.load(f) except yaml.YAMLError as e: - raise ValueError('Invalid yaml file: {}'.format(str(e))) + raise ValueError('Invalid yaml file: {}'.format(str(e))) from e self._num_shards = len(shards) for shard_index in range(self._num_shards): diff --git a/gcp_variant_transforms/libs/variant_sharding_test.py b/gcp_variant_transforms/libs/variant_sharding_test.py index b58dff149..7618e524f 100644 --- 
a/gcp_variant_transforms/libs/variant_sharding_test.py +++ b/gcp_variant_transforms/libs/variant_sharding_test.py @@ -14,7 +14,6 @@ """Unit tests for variant_sharding module.""" -from __future__ import absolute_import import unittest @@ -131,10 +130,10 @@ def test_config_non_existent_shard_name(self): 'residual_at_end.yaml') self.assertEqual(sharder.get_num_shards(), 8) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Given shard index -1 is outside of expected range*'): sharder.get_output_table_suffix(-1) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Given shard index 8 is outside of expected range*'): sharder.get_output_table_suffix(8) @@ -236,7 +235,7 @@ def test_config_failed_missing_region(self): ' regions:', ' partition_range_end: 999999999', ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, regions field missing.'): _ = variant_sharding.VariantSharding( @@ -251,7 +250,7 @@ def test_config_failed_missing_shard_name(self): ' - "chr1:0-1,000,000"', ' partition_range_end: 999999999', ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, table_name_suffix field missing.'): _ = variant_sharding.VariantSharding( @@ -264,7 +263,7 @@ def test_config_failed_missing_shard_name(self): ' - "chr1:0-1,000,000"', ' partition_range_end: 999999999', ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, table_name_suffix can not be empty.'): _ = variant_sharding.VariantSharding( @@ -290,7 +289,7 @@ def test_config_failed_duplicate_residual_shard(self): ' - "residual"', ' partition_range_end: 999999999', ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, there can be only one residual output*'): _ = variant_sharding.VariantSharding( @@ -311,7 +310,7 @@ def test_config_failed_overlapping_regions(self): ' - "chr1:999,999-2,000,000"', ' partition_range_end: 999999999', ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, regions must be unique*'): _ = variant_sharding.VariantSharding( tempdir.create_temp_file(suffix='.yaml', @@ -329,7 +328,7 @@ def test_config_failed_overlapping_regions(self): ' - "chr1:1,000,000-2,000,000"', ' partition_range_end: 999999999', ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, regions must be unique*'): _ = variant_sharding.VariantSharding( tempdir.create_temp_file(suffix='.yaml', @@ -347,7 +346,7 @@ def test_config_failed_overlapping_regions(self): ' - "chr1"', ' partition_range_end: 999999999', ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, regions must be unique*'): _ = variant_sharding.VariantSharding( tempdir.create_temp_file(suffix='.yaml', @@ -370,7 +369,7 @@ def test_config_failed_overlapping_regions(self): ' - "chr1"', ' partition_range_end: 999999999', ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, regions must be unique*'): _ = variant_sharding.VariantSharding( tempdir.create_temp_file(suffix='.yaml', @@ -395,7 +394,7 @@ def test_config_failed_duplicate_table_name(self): ' - "chr1:1,000,000-2,000,000"', ' partition_range_end: 999999999', ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, table name suffixes must be 
unique*'): _ = variant_sharding.VariantSharding( @@ -412,7 +411,7 @@ def test_config_failed_missing_fields(self): ' - "1"', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharing config file, output_table field missing.'): _ = variant_sharding.VariantSharding( @@ -426,7 +425,7 @@ def test_config_failed_missing_fields(self): ' - "1"', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, table_name_suffix field missing.'): _ = variant_sharding.VariantSharding( @@ -438,7 +437,7 @@ def test_config_failed_missing_fields(self): ' table_name_suffix: "chr1"', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, regions field missing.'): _ = variant_sharding.VariantSharding( @@ -451,7 +450,7 @@ def test_config_failed_missing_fields(self): ' regions:', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, regions field missing.'): _ = variant_sharding.VariantSharding( @@ -465,7 +464,7 @@ def test_config_failed_missing_fields(self): ' - "chr1"', ' - "1"', ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, partition_range_end field missing.'): _ = variant_sharding.VariantSharding( @@ -482,7 +481,7 @@ def test_config_failed_wrong_fields(self): ' - "1"', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, table_name_suffix can not be empty.'): _ = variant_sharding.VariantSharding( @@ -498,7 +497,7 @@ def test_config_failed_wrong_fields(self): ' - "1"', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, BigQuery table name can only contain *'): _ = variant_sharding.VariantSharding( @@ -518,7 +517,7 @@ def test_config_failed_wrong_fields(self): ' - "chr2"', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, table name suffixes must be unique*'): _ = variant_sharding.VariantSharding( @@ -534,7 +533,7 @@ def test_config_failed_wrong_fields(self): ' - " "', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, reference_name can not be empty string: '): _ = variant_sharding.VariantSharding( @@ -550,7 +549,7 @@ def test_config_failed_wrong_fields(self): ' - "dup_value"', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, regions must be unique in config file: *'): _ = variant_sharding.VariantSharding( @@ -569,7 +568,7 @@ def test_config_failed_wrong_fields(self): ' - "dup_value"', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, regions must be unique in config file: *'): _ = variant_sharding.VariantSharding( @@ -588,7 +587,7 @@ def test_config_failed_wrong_fields(self): ' - "residual"', ' partition_range_end: 249240615' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, there can be only one residual output *'): _ = variant_sharding.VariantSharding( @@ 
-603,7 +602,7 @@ def test_config_failed_wrong_fields(self): ' - "1"', ' partition_range_end: "not int"' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, each output table needs an integer for *'): _ = variant_sharding.VariantSharding( @@ -618,7 +617,7 @@ def test_config_failed_wrong_fields(self): ' - "1"', ' partition_range_end: -10' ] - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Wrong sharding config file, each output table needs an integer for *'): _ = variant_sharding.VariantSharding( diff --git a/gcp_variant_transforms/libs/vcf_field_conflict_resolver.py b/gcp_variant_transforms/libs/vcf_field_conflict_resolver.py index cd542c454..3adca5c64 100644 --- a/gcp_variant_transforms/libs/vcf_field_conflict_resolver.py +++ b/gcp_variant_transforms/libs/vcf_field_conflict_resolver.py @@ -20,7 +20,7 @@ from gcp_variant_transforms.libs import bigquery_util -class FieldConflictResolver(object): +class FieldConflictResolver(): """A class for resolving all VCF field related mismatches. Example mismatch: conflict in definition of a VCF field (INFO, FORMAT, etc), diff --git a/gcp_variant_transforms/libs/vcf_field_conflict_resolver_test.py b/gcp_variant_transforms/libs/vcf_field_conflict_resolver_test.py index 75edc4af9..69ef5f4c9 100644 --- a/gcp_variant_transforms/libs/vcf_field_conflict_resolver_test.py +++ b/gcp_variant_transforms/libs/vcf_field_conflict_resolver_test.py @@ -209,7 +209,8 @@ def test_resolving_attribute_conflict_in_number_allele(self): self._resolver_allele.resolve_attribute_conflict( VcfParserHeaderKeyConstants.NUM, i, j) self.fail( - 'Should raise exception for unresolvable number: %d vs %d'%(i, j)) + 'Should raise exception for unresolvable number: {} vs {}'.format( + i, j)) def test_resolving_all_field_definition_conflict_in_type(self): self.assertEqual( diff --git a/gcp_variant_transforms/libs/vcf_file_composer.py b/gcp_variant_transforms/libs/vcf_file_composer.py index d134eaea1..433f158d6 100644 --- a/gcp_variant_transforms/libs/vcf_file_composer.py +++ b/gcp_variant_transforms/libs/vcf_file_composer.py @@ -137,7 +137,7 @@ def _create_blob(client, file_path): return file_blob -class MultiProcessComposer(object): +class MultiProcessComposer(): """Class to compose (a large number of) files in GCS in parallel.""" def __init__(self, project, bucket_name, blob_prefix): diff --git a/gcp_variant_transforms/libs/vcf_header_definitions_merger.py b/gcp_variant_transforms/libs/vcf_header_definitions_merger.py index 672e7e237..42554d8c3 100644 --- a/gcp_variant_transforms/libs/vcf_header_definitions_merger.py +++ b/gcp_variant_transforms/libs/vcf_header_definitions_merger.py @@ -27,7 +27,7 @@ vcf_header_io.VcfParserHeaderKeyConstants.TYPE]) -class VcfHeaderDefinitions(object): +class VcfHeaderDefinitions(): """Container for header definitions.""" def __init__(self, vcf_header=None): @@ -41,12 +41,12 @@ def __init__(self, vcf_header=None): self._formats = collections.defaultdict(dict) if not vcf_header: return - for key, val in vcf_header.infos.iteritems(): + for key, val in vcf_header.infos.items(): definition = Definition( val[vcf_header_io.VcfParserHeaderKeyConstants.NUM], val[vcf_header_io.VcfParserHeaderKeyConstants.TYPE]) self._infos[key][definition] = [vcf_header.file_path] - for key, val in vcf_header.formats.iteritems(): + for key, val in vcf_header.formats.items(): definition = Definition( val[vcf_header_io.VcfParserHeaderKeyConstants.NUM], val[vcf_header_io.VcfParserHeaderKeyConstants.TYPE]) 
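The `iteritems()` → `items()` substitutions in the surrounding hunks are the standard Python 3 change for dict iteration: `dict.iteritems()` is gone, and `items()` now returns a lazy view instead of a list. A minimal sketch of the pattern (the `infos` dict is a stand-in, not the module's real data):

```python
infos = {'NS': '1', 'DP': '14'}

# Python 2 spelled this infos.iteritems(); on Python 3, items() is already
# a cheap view that does not copy the dict.
for key, val in infos.items():
  print(key, val)

# Views are not lists; materialize explicitly when a list is required,
# which is why the tests above wrap keys() in list() before assertEqual.
keys_as_list = list(infos.keys())
```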
@@ -64,7 +64,7 @@ def formats(self): return self._formats -class DefinitionsMerger(object): +class DefinitionsMerger(): """Class for merging two `VcfHeaderDefinitions`s.""" # For the same field definition, save at most `_MAX_NUM_FILE_NAMES` names. @@ -86,8 +86,8 @@ def _merge_definitions( ): # type: (...) -> None """Updates `first` by merging values from `first` and `second`.""" - for key, definitions_to_files_map in second.iteritems(): - for definition, file_names in definitions_to_files_map.iteritems(): + for key, definitions_to_files_map in second.items(): + for definition, file_names in definitions_to_files_map.items(): first[key].setdefault(definition, []) first[key][definition].extend(str(s) for s in file_names) first[key][definition] = ( diff --git a/gcp_variant_transforms/libs/vcf_header_parser.py b/gcp_variant_transforms/libs/vcf_header_parser.py index ae099c3ae..45f4912ac 100644 --- a/gcp_variant_transforms/libs/vcf_header_parser.py +++ b/gcp_variant_transforms/libs/vcf_header_parser.py @@ -14,7 +14,6 @@ """Helper library for reading VCF headers from multiple files.""" -from __future__ import absolute_import from pysam import libcbcf @@ -89,9 +88,9 @@ def _header_line_generator(file_name): with FileSystems.open(file_name) as f: record = None while True: - record = f.readline() + record = f.readline().decode('utf-8') while record and not record.strip(): # Skip empty lines. - record = f.readline() + record = f.readline().decode('utf-8') if record and record.startswith('#'): yield record else: diff --git a/gcp_variant_transforms/libs/vcf_header_parser_test.py b/gcp_variant_transforms/libs/vcf_header_parser_test.py index cb8fab22d..72cf76137 100644 --- a/gcp_variant_transforms/libs/vcf_header_parser_test.py +++ b/gcp_variant_transforms/libs/vcf_header_parser_test.py @@ -14,7 +14,7 @@ """Tests for vcf_header_parser module.""" -from __future__ import absolute_import + import unittest @@ -40,8 +40,8 @@ def test_one_file(self): with temp_dir.TempDir() as tempdir: file_path = self._create_temp_vcf_file(lines, tempdir) header_fields = vcf_header_parser.get_vcf_headers(file_path) - self.assertItemsEqual(['NS', 'AF'], header_fields.infos.keys()) - self.assertItemsEqual(['GT', 'GQ'], header_fields.formats.keys()) + self.assertCountEqual(['NS', 'AF'], list(header_fields.infos.keys())) + self.assertCountEqual(['GT', 'GQ'], list(header_fields.formats.keys())) def test_invalid_file(self): lines = [ diff --git a/gcp_variant_transforms/libs/vcf_reserved_fields.py b/gcp_variant_transforms/libs/vcf_reserved_fields.py index 7a10fccdb..7ae889d00 100644 --- a/gcp_variant_transforms/libs/vcf_reserved_fields.py +++ b/gcp_variant_transforms/libs/vcf_reserved_fields.py @@ -17,7 +17,6 @@ See http://samtools.github.io/hts-specs/VCFv4.3.pdf for more details. """ -from __future__ import absolute_import import collections from typing import Optional # pylint: disable=unused-import diff --git a/gcp_variant_transforms/options/variant_transform_options.py b/gcp_variant_transforms/options/variant_transform_options.py index a2aa9a55b..41b30840e 100644 --- a/gcp_variant_transforms/options/variant_transform_options.py +++ b/gcp_variant_transforms/options/variant_transform_options.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import import argparse # pylint: disable=unused-import @@ -30,7 +29,7 @@ sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX) -class VariantTransformsOptions(object): +class VariantTransformsOptions(): """Base class for defining groups of options for Variant Transforms. Transforms should create a derived class of ``VariantTransformsOptions`` @@ -48,7 +47,6 @@ def add_arguments(self, parser): def validate(self, parsed_args): # type: (argparse.Namespace) -> None """Validates this group's options parsed from the command line.""" - pass class VcfReadOptions(VariantTransformsOptions): diff --git a/gcp_variant_transforms/options/variant_transform_options_test.py b/gcp_variant_transforms/options/variant_transform_options_test.py index ee1e185e4..3e20d0ad2 100644 --- a/gcp_variant_transforms/options/variant_transform_options_test.py +++ b/gcp_variant_transforms/options/variant_transform_options_test.py @@ -112,7 +112,7 @@ def test_existing_sample_table(self): tableReference=bigquery.TableReference(projectId='project', datasetId='dataset', tableId='table__sample_info')) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'project:dataset.table__sample_info already exists'): self._options.validate(args, client) @@ -126,11 +126,10 @@ def side_effect(request): tableId='table__sample_info')): raise exceptions.HttpError(response={'status': '404'}, url='', content='') - else: - return bigquery.Table(tableReference=bigquery.TableReference( - projectId='project', - datasetId='dataset', - tableId='table__chr1_part1')) + return bigquery.Table(tableReference=bigquery.TableReference( + projectId='project', + datasetId='dataset', + tableId='table__chr1_part1')) args = self._make_args( ['--append', 'False', '--output_table', 'project:dataset.table', '--sharding_config_path', @@ -139,7 +138,7 @@ def side_effect(request): client = mock.Mock() client.tables.Get.side_effect = side_effect - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'project:dataset.table__chr01_part1 already exists'): self._options.validate(args, client) @@ -153,7 +152,7 @@ def test_missing_sample_table(self): client = mock.Mock() client.tables.Get.side_effect = exceptions.HttpError( response={'status': '404'}, url='', content='') - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'project:dataset.table__sample_info does not exist'): self._options.validate(args, client) @@ -181,7 +180,7 @@ def side_effect(request): client = mock.Mock() client.tables.Get.side_effect = side_effect - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'project:dataset.table__chr01_part1 does not exist'): self._options.validate(args, client) diff --git a/gcp_variant_transforms/pipeline_common.py b/gcp_variant_transforms/pipeline_common.py index b6daf8de9..61924a1ec 100644 --- a/gcp_variant_transforms/pipeline_common.py +++ b/gcp_variant_transforms/pipeline_common.py @@ -78,6 +78,9 @@ def parse_args(argv, command_line_options): if hasattr(known_args, 'input_pattern') or hasattr(known_args, 'input_file'): known_args.all_patterns = _get_all_patterns( known_args.input_pattern, known_args.input_file) + + # https://github.com/googlegenomics/gcp-variant-transforms/issues/667 + pipeline_args.extend(['--save_main_session', 'True']) return known_args, pipeline_args @@ -95,17 +98,15 @@ def _get_all_patterns(input_pattern, input_file): raise ValueError( 'Input pattern {} from {} did not match any files.'.format( match.pattern, input_file)) - 
else: - raise ValueError( - 'Input pattern {} did not match any files.'.format(match.pattern)) - except filesystem.BeamIOError: + raise ValueError( + 'Input pattern {} did not match any files.'.format(match.pattern)) + except filesystem.BeamIOError as e: if input_file: raise ValueError( 'Some patterns in {} are invalid or inaccessible.'.format( - input_file)) - else: - raise ValueError('Invalid or inaccessible input pattern {}.'.format( - input_pattern)) + input_file)) from e + raise ValueError('Invalid or inaccessible input pattern {}.'.format( + input_pattern)) from e return patterns @@ -151,7 +152,8 @@ def _get_file_names(input_file): if not filesystems.FileSystems.exists(input_file): raise ValueError('Input file {} doesn\'t exist'.format(input_file)) with filesystems.FileSystems.open(input_file) as f: - contents = map(str.strip, f.readlines()) + contents = list(map(str.strip, + [line.decode('utf-8') for line in f.readlines()])) if not contents: raise ValueError('Input file {} is empty.'.format(input_file)) return contents diff --git a/gcp_variant_transforms/pipeline_common_test.py b/gcp_variant_transforms/pipeline_common_test.py index 4103511a1..039da27aa 100644 --- a/gcp_variant_transforms/pipeline_common_test.py +++ b/gcp_variant_transforms/pipeline_common_test.py @@ -35,7 +35,8 @@ class PipelineCommonWithPatternTest(unittest.TestCase): """Tests cases for the `pipeline_common` script with pattern input.""" def _create_mock_args(self, **args): - return collections.namedtuple('MockArgs', args.keys())(*args.values()) + return collections.namedtuple( + 'MockArgs', list(args.keys()))(*list(args.values())) def _get_pipeline_mode(self, args): all_patterns = pipeline_common._get_all_patterns(args.input_pattern, @@ -43,7 +44,7 @@ def _get_pipeline_mode(self, args): return pipeline_common.get_pipeline_mode(all_patterns) def test_validation_failure_for_invalid_input_pattern(self): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Input pattern .* did not match any files.'): pipeline_common._get_all_patterns( input_pattern='nonexistent_file.vcf', input_file=None) @@ -60,11 +61,11 @@ def test_get_mode_medium(self): args = self._create_mock_args(input_pattern='*', input_file=None) match_result = collections.namedtuple('MatchResult', ['metadata_list']) - match = match_result(range(101)) + match = match_result(list(range(101))) with mock.patch.object(FileSystems, 'match', return_value=[match]): self.assertEqual(self._get_pipeline_mode(args), PipelineModes.MEDIUM) - match = match_result(range(50000)) + match = match_result(list(range(50000))) with mock.patch.object(FileSystems, 'match', return_value=[match]): self.assertEqual(self._get_pipeline_mode(args), PipelineModes.MEDIUM) @@ -72,7 +73,7 @@ def test_get_mode_large(self): args = self._create_mock_args(input_pattern='test', input_file=None) match_result = collections.namedtuple('MatchResult', ['metadata_list']) - match = match_result(range(50001)) + match = match_result(list(range(50001))) with mock.patch.object(FileSystems, 'match', return_value=[match]): self.assertEqual(self._get_pipeline_mode(args), PipelineModes.LARGE) @@ -82,7 +83,7 @@ def test_fail_on_invalid_flags(self): 'gcp-variant-transforms-test', '--staging_location', 'gs://integration_test_runs/staging'] - with self.assertRaisesRegexp(ValueError, 'job_name'): + with self.assertRaisesRegex(ValueError, 'job_name'): pipeline_common._raise_error_on_invalid_flags(pipeline_args) # Add job_name (required for Variant Transforms run). This is now valid. 
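The `assertRaisesRegexp` → `assertRaisesRegex` and `assertItemsEqual` → `assertCountEqual` renames running through these test files track the Python 3 `unittest` API: the `Regexp` spelling survives only as a deprecated alias, and `assertItemsEqual` became `assertCountEqual`, an order-insensitive, duplicate-aware comparison. A small self-contained illustration (the test class is hypothetical):

```python
import unittest


class RenamedAssertsTest(unittest.TestCase):

  def test_count_equal(self):
    # Same elements in any order pass; duplicates are still counted.
    self.assertCountEqual(['A1', 'A2', 'A3'], ['A3', 'A1', 'A2'])

  def test_raises_regex(self):
    with self.assertRaisesRegex(ValueError, 'did not match any files'):
      raise ValueError('Input pattern x did not match any files.')


if __name__ == '__main__':
  unittest.main()
```

Relatedly, the `raise ValueError(...) from e` forms added to pipeline_common.py and variant_sharding.py above use Python 3 exception chaining, which keeps the original error reachable as `__cause__` instead of discarding its traceback.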
pipeline_args.extend(['--job_name', 'correct-01-job-name-02']) @@ -90,7 +91,7 @@ def test_fail_on_invalid_flags(self): # Add Dataflow runner (requires --setup_file). pipeline_args.extend(['--runner', 'DataflowRunner']) - with self.assertRaisesRegexp(ValueError, 'setup_file'): + with self.assertRaisesRegex(ValueError, 'setup_file'): pipeline_common._raise_error_on_invalid_flags(pipeline_args) # Add setup.py (required for Variant Transforms run). This is now valid. @@ -99,7 +100,7 @@ def test_fail_on_invalid_flags(self): # Add an unknown flag. pipeline_args.extend(['--unknown_flag', 'somevalue']) - with self.assertRaisesRegexp(ValueError, 'Unrecognized.*unknown_flag'): + with self.assertRaisesRegex(ValueError, 'Unrecognized.*unknown_flag'): pipeline_common._raise_error_on_invalid_flags(pipeline_args) def test_get_compression_type(self): @@ -165,7 +166,8 @@ class PipelineCommonWithFileTest(unittest.TestCase): def _create_mock_args(self, **args): - return collections.namedtuple('MockArgs', args.keys())(*args.values()) + return collections.namedtuple( + 'MockArgs', list(args.keys()))(*list(args.values())) def _get_pipeline_mode(self, args): all_patterns = pipeline_common._get_all_patterns(args.input_pattern, @@ -188,25 +190,25 @@ def test_get_mode_large(self): args = self._create_mock_args(input_pattern=None, input_file=filename) match_result = collections.namedtuple('MatchResult', ['metadata_list']) - match = match_result(range(50001)) + match = match_result(list(range(50001))) with mock.patch.object(FileSystems, 'match', return_value=[match]): self.assertEqual(self._get_pipeline_mode(args), PipelineModes.LARGE) - matches = [match_result(range(25000)), - match_result(range(25000)), - match_result(range(1))] + matches = [match_result(list(range(25000))), + match_result(list(range(25000))), + match_result(list(range(1)))] with mock.patch.object(FileSystems, 'match', return_value=matches): self.assertEqual(self._get_pipeline_mode(args), PipelineModes.LARGE) def test_validation_failure_for_invalid_input_file(self): - with self.assertRaisesRegexp(ValueError, 'Input file .* doesn\'t exist'): + with self.assertRaisesRegex(ValueError, 'Input file .* doesn\'t exist'): pipeline_common._get_all_patterns( input_pattern=None, input_file='nonexistent_file.vcf') def test_validation_failure_for_empty_input_file(self): with temp_dir.TempDir() as tempdir: filename = tempdir.create_temp_file(lines=[]) - with self.assertRaisesRegexp(ValueError, 'Input file .* is empty.'): + with self.assertRaisesRegex(ValueError, 'Input file .* is empty.'): pipeline_common._get_all_patterns( input_pattern=None, input_file=filename) @@ -216,7 +218,7 @@ def test_validation_failure_for_wrong_pattern_in_input_file(self): './gcp_variant_transforms/testing/data/vcf/valid-4.0.vcf\n'] with temp_dir.TempDir() as tempdir: filename = tempdir.create_temp_file(lines=lines) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'Input pattern .* from .* did not match any files.'): pipeline_common._get_all_patterns( input_pattern=None, input_file=filename) diff --git a/gcp_variant_transforms/testing/asserts.py b/gcp_variant_transforms/testing/asserts.py index 46c9d1a09..ce6115d73 100644 --- a/gcp_variant_transforms/testing/asserts.py +++ b/gcp_variant_transforms/testing/asserts.py @@ -14,7 +14,6 @@ """Custom asserts for tests.""" -from __future__ import absolute_import from typing import Callable, List # pylint: disable=unused-import from apache_beam.testing.util import BeamAssertException @@ -24,11 +23,14 @@ def 
items_equal(expected): """Returns a function for checking expected and actual have the same items.""" def _items_equal(actual): - sorted_expected = sorted(expected) - sorted_actual = sorted(actual) - if sorted_expected != sorted_actual: - raise BeamAssertException( - 'Failed assert: %r != %r' % (sorted_expected, sorted_actual)) + compare = actual.copy() + for e in expected: + if e not in compare: + raise BeamAssertException( + 'Failed assert: %r != %r' % (expected, actual)) + compare.remove(e) + if compare: + raise BeamAssertException('Failed assert: %r != %r' % (expected, actual)) return _items_equal diff --git a/gcp_variant_transforms/testing/bigquery_schema_util.py b/gcp_variant_transforms/testing/bigquery_schema_util.py index 093b5f2a9..0f77430f6 100644 --- a/gcp_variant_transforms/testing/bigquery_schema_util.py +++ b/gcp_variant_transforms/testing/bigquery_schema_util.py @@ -14,7 +14,6 @@ """Utility functions for creating BigQuery schema used by unit tests.""" -from __future__ import absolute_import from apache_beam.io.gcp.internal.clients import bigquery diff --git a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/no_options.vcf b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/no_options.vcf index 05a2e10ba..a1141ceca 100644 --- a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/no_options.vcf +++ b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/no_options.vcf @@ -9,8 +9,8 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -19 1234567 microsat1 GTCT G,GTACT 50.0 PASS AA=G;NS=3;DP=9 GT:DP:GQ 0/1:4:35 0/2:2:17 1/1:3:40 -20 14370 rs6054257 G A 29.0 PASS H2;NS=3;DB;DP=14;AF=0.5 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. +19 1234567 microsat1 GTCT G,GTACT 50.0 PASS NS=3;DP=9;AA=G GT:DP:GQ 0/1:4:35 0/2:2:17 1/1:3:40 +20 14370 rs6054257 G A 29.0 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. 20 17330 . T A 3.0 q10 NS=3;DP=11;AF=0.017 GT:DP:GQ:HQ 0|0:3:49:58,50 0|1:5:3:65,3 0/0:3:41:. -20 1110696 rs6040355 A G,T 67.0 PASS AA=T;NS=2;DB;DP=10;AF=0.333,0.667 GT:DP:GQ:HQ 1|2:6:21:23,27 2|1:0:2:18,2 2/2:4:35:. -20 1230237 . T . 47.0 PASS AA=T;NS=3;DP=13 GT:DP:GQ:HQ 0|0:7:54:56,60 0|0:4:48:51,51 0/0:2:61:. +20 1110696 rs6040355 A G,T 67.0 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:DP:GQ:HQ 1|2:6:21:23,27 2|1:0:2:18,2 2/2:4:35:. +20 1230237 . T . 47.0 PASS NS=3;DP=13;AA=T GT:DP:GQ:HQ 0|0:7:54:56,60 0|0:4:48:51,51 0/0:2:61:. diff --git a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_allow_incompatible_schema.vcf b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_allow_incompatible_schema.vcf index c39e793b0..c075efc37 100644 --- a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_allow_incompatible_schema.vcf +++ b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_allow_incompatible_schema.vcf @@ -11,16 +11,16 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -19 14370 rs6054257 G A 29.0 PASS H2;NS=3;DB;DP=14;AF=1.79769313486e+307 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. +19 14370 rs6054257 G A 29.0 PASS NS=3;DP=14;DB;H2;AF=1.7976931348623158e+307 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. 20 17330 . T A 3.0 q10 NS=3;DP=11;AF=0.017 GT:DP:GQ:HQ 0|0:3:49:58,50 0|1:5:3:65,3 0/0:3:41:. 
-20 1110696 rs6040355 A G,T 67.0 PASS AA=T;NS=2;DB;DP=10;AF=-1.79769313486e+307,0.667 GT:DP:GQ:HQ 1|2:6:21:23,27 2|1:0:2:18,2 2/2:4:35:. -20 1230237 . T . 47.0 PASS AA=T;NS=3;DP=13 GT:DP:GQ:HQ 0|0:7:54:56,60 0|0:4:48:51,51 0/0:2:61:. -20 1234567 microsat1 GTC G,GTCTC 50.0 PASS AA=G;NS=3;DP=9 GT:DP:GQ 0/1:4:35 0/2:2:17 1/1:3:40 -20 2234567 . C [13:123457[ACGC 50.0 PASS AA=G;SVTYPE=BÑD;NS=3;DP=9 GT:DP:GQ 0/1:4:35 0/1:2:17 1/1:3:40 -20 2234568 . C .TC 50.0 PASS AA=G;SVTYPE=BND;NS=3;DP=9 GT:DP:GQ 0/1:4:35 0/1:2:17 1/1:3:40 -20 2234569 . C CT. 50.0 PASS AA=G;SVTYPE=BND;NS=3;DP=9 GT:DP:GQ 0/1:4:35 0/1:2:17 1/1:3:40 -20 3234569 . C 50.0 PASS END=3235677;AA=G;NS=3;DP=9 GT:DP:GQ 0/1:4:35 0/1:2:17 1/1:3:40 -20 4234569 . N .[13:123457[ 50.0 PASS AA=G;SVTYPE=BND;NS=3;DP=9 GT:DP:GQ 0/1:4:35 0/1:2:17 ./.:3:40 -20 5234569 . N [13:123457[. 50.0 PASS AA=G;SVTYPE=BND;NS=3;DP=9 GT:DP:GQ 0/1:4:35 0/1:2:17 1/1:3:40 -HLA-A*01:01:01:01 1 . N T 50.0 PASS AA=G;NS=3;DP=9 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. +20 1110696 rs6040355 A G,T 67.0 PASS NS=2;DP=10;AA=T;DB;AF=-1.7976931348623158e+307,0.667 GT:DP:GQ:HQ 1|2:6:21:23,27 2|1:0:2:18,2 2/2:4:35:. +20 1230237 . T . 47.0 PASS NS=3;DP=13;AA=T GT:DP:GQ:HQ 0|0:7:54:56,60 0|0:4:48:51,51 0/0:2:61:. +20 1234567 microsat1 GTC G,GTCTC 50.0 PASS NS=3;DP=9;AA=G GT:DP:GQ 0/1:4:35 0/2:2:17 1/1:3:40 +20 2234567 . C [13:123457[ACGC 50.0 PASS NS=3;DP=9;AA=G;SVTYPE=BÑD GT:DP:GQ 0/1:4:35 0/1:2:17 1/1:3:40 +20 2234568 . C .TC 50.0 PASS NS=3;DP=9;AA=G;SVTYPE=BND GT:DP:GQ 0/1:4:35 0/1:2:17 1/1:3:40 +20 2234569 . C CT. 50.0 PASS NS=3;DP=9;AA=G;SVTYPE=BND GT:DP:GQ 0/1:4:35 0/1:2:17 1/1:3:40 +20 3234569 . C 50.0 PASS END=3235677;NS=3;DP=9;AA=G GT:DP:GQ 0/1:4:35 0/1:2:17 1/1:3:40 +20 4234569 . N .[13:123457[ 50.0 PASS NS=3;DP=9;AA=G;SVTYPE=BND GT:DP:GQ 0/1:4:35 0/1:2:17 ./.:3:40 +20 5234569 . N [13:123457[. 50.0 PASS NS=3;DP=9;AA=G;SVTYPE=BND GT:DP:GQ 0/1:4:35 0/1:2:17 1/1:3:40 +HLA-A*01:01:01:01 1 . N T 50.0 PASS NS=3;DP=9;AA=G GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. Y 17330 . T A 3.0 q10 NS=3;DP=11 GT:GL 0:0,49 0:0,3 1:41,0 diff --git a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_customized_export.vcf b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_customized_export.vcf index a344cda42..2175ad18e 100644 --- a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_customized_export.vcf +++ b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_customized_export.vcf @@ -9,6 +9,6 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00003 -19 1234567 microsat1 GTCT G,GTACT 50.0 PASS AA=G;NS=3;DP=9 GT:DP:GQ 0/1:4:35 1/1:3:40 -20 14370 rs6054257 G A 29.0 PASS H2;NS=3;DB;DP=14;AF=0.5 GT:DP:GQ:HQ 0|0:1:48:51,51 1/1:5:43:. +19 1234567 microsat1 GTCT G,GTACT 50.0 PASS NS=3;DP=9;AA=G GT:DP:GQ 0/1:4:35 1/1:3:40 +20 14370 rs6054257 G A 29.0 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:DP:GQ:HQ 0|0:1:48:51,51 1/1:5:43:. 20 17330 . T A 3.0 q10 NS=3;DP=11;AF=0.017 GT:DP:GQ:HQ 0|0:3:49:58,50 0/0:3:41:. 
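The widened float literals in the option_allow_incompatible_schema expected output above (`1.79769313486e+307` becoming `1.7976931348623158e+307`) reflect a `str()` behavior change rather than new data: Python 2's `str(float)` truncated to 12 significant digits, while Python 3 prints the shortest string that round-trips to the same double. A quick check:

```python
x = 1.7976931348623158e+307

print(str(x))              # 1.7976931348623158e+307 on Python 3
print(float(str(x)) == x)  # True: the printed form round-trips exactly

# Python 2 printed '1.79769313486e+307', which parses back to a
# different double, so the old expected output no longer matches.
print(float('1.79769313486e+307') == x)  # False
```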
diff --git a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_preserve_sample_order.vcf b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_preserve_sample_order.vcf index 0ae4a7a46..2f7465bbd 100644 --- a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_preserve_sample_order.vcf +++ b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_preserve_sample_order.vcf @@ -9,7 +9,7 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 NA00004 -19 1234567 microsat1;microsat2 GTCT G,GTACT 50.0 PASS AA=G;NS=2;DP=9 GT:DP:GQ 0/1:4:35 0/2:2:17 1/1:3:40 .:.:. -20 14370 rs6054257 G A 30.0 PASS;q10 H2;NS=2;DB;DP=14;AF=0.5 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. 0|0:1:48:51,51 +19 1234567 microsat1;microsat2 GTCT G,GTACT 50.0 PASS NS=2;DP=9;AA=G GT:DP:GQ 0/1:4:35 0/2:2:17 1/1:3:40 .:.:. +20 14370 rs6054257 G A 30.0 PASS;q10 NS=2;DP=14;DB;H2;AF=0.5 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. 0|0:1:48:51,51 20 17290 . T A 3.0 q10 NS=2;DP=11;AF=0.017 GT:DP:GQ:HQ 0|0:3:49:58,50 0|1:5:3:65,3 .:.:.:. .:.:.:. 20 17330 . T A 3.0 q10 NS=1;DP=11;AF=0.017 GT:DP:GQ .:.:. .:.:. 0/0:3:41 .:.:. diff --git a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_representative_header_file.vcf b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_representative_header_file.vcf index 502fba020..2277123fd 100644 --- a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_representative_header_file.vcf +++ b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_representative_header_file.vcf @@ -16,8 +16,8 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -19 1234567 microsat1 GTCT G,GTACT 50.0 PASS AA=G;NS=3;DP=9 GT:DP:GQ 0/1:4:35 0/2:2:17 1/1:3:40 -20 14370 rs6054257 G A 29.0 PASS H2;NS=3;DB;DP=14;AF=0.5 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. +19 1234567 microsat1 GTCT G,GTACT 50.0 PASS NS=3;DP=9;AA=G GT:DP:GQ 0/1:4:35 0/2:2:17 1/1:3:40 +20 14370 rs6054257 G A 29.0 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. 20 17330 . T A 3.0 q10 NS=3;DP=11;AF=0.017 GT:DP:GQ:HQ 0|0:3:49:58,50 0|1:5:3:65,3 0/0:3:41:. -20 1110696 rs6040355 A G,T 67.0 PASS AA=T;NS=2;DB;DP=10;AF=0.333,0.667 GT:DP:GQ:HQ 1|2:6:21:23,27 2|1:0:2:18,2 2/2:4:35:. -20 1230237 . T . 47.0 PASS AA=T;NS=3;DP=13 GT:DP:GQ:HQ 0|0:7:54:56,60 0|0:4:48:51,51 0/0:2:61:. +20 1110696 rs6040355 A G,T 67.0 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:DP:GQ:HQ 1|2:6:21:23,27 2|1:0:2:18,2 2/2:4:35:. +20 1230237 . T . 47.0 PASS NS=3;DP=13;AA=T GT:DP:GQ:HQ 0|0:7:54:56,60 0|0:4:48:51,51 0/0:2:61:. 
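The reordered INFO columns in these expected outputs (for example `AA=G;NS=3;DP=9` becoming `NS=3;DP=9;AA=G`) line up with the field order of the source VCF records; plausibly this falls out of Python 3.7's insertion-ordered dicts, so keys serialize back out in the order they were parsed rather than in the arbitrary order a Python 2 dict produced. A sketch of that effect:

```python
# Dict insertion order is guaranteed from Python 3.7 on, so round-tripping
# an INFO column through a plain dict preserves file order.
info = {}
for field in ('NS=3', 'DP=14', 'AF=0.5', 'DB', 'H2'):
  key, _, value = field.partition('=')
  info[key] = value if value else True  # flags like DB carry no value

print(';'.join(k if v is True else '{}={}'.format(k, v)
               for k, v in info.items()))
# NS=3;DP=14;AF=0.5;DB;H2
```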
diff --git a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_use_1_based_coordinate.vcf b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_use_1_based_coordinate.vcf index 61571fa17..e92e77eac 100644 --- a/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_use_1_based_coordinate.vcf +++ b/gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_use_1_based_coordinate.vcf @@ -9,8 +9,8 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -19 1234566 microsat1 GTCT G,GTACT 50.0 PASS END=1234570;AA=G;NS=3;DP=9 GT:DP:GQ 0/1:4:35 0/2:2:17 1/1:3:40 -20 14369 rs6054257 G A 29.0 PASS END=14370;H2;NS=3;DB;DP=14;AF=0.5 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. +19 1234566 microsat1 GTCT G,GTACT 50.0 PASS END=1234570;NS=3;DP=9;AA=G GT:DP:GQ 0/1:4:35 0/2:2:17 1/1:3:40 +20 14369 rs6054257 G A 29.0 PASS END=14370;NS=3;DP=14;AF=0.5;DB;H2 GT:DP:GQ:HQ 0|0:1:48:51,51 1|0:8:48:51,51 1/1:5:43:. 20 17329 . T A 3.0 q10 END=17330;NS=3;DP=11;AF=0.017 GT:DP:GQ:HQ 0|0:3:49:58,50 0|1:5:3:65,3 0/0:3:41:. -20 1110695 rs6040355 A G,T 67.0 PASS END=1110696;AA=T;NS=2;DB;DP=10;AF=0.333,0.667 GT:DP:GQ:HQ 1|2:6:21:23,27 2|1:0:2:18,2 2/2:4:35:. -20 1230236 . T . 47.0 PASS END=1230237;AA=T;NS=3;DP=13 GT:DP:GQ:HQ 0|0:7:54:56,60 0|0:4:48:51,51 0/0:2:61:. +20 1110695 rs6040355 A G,T 67.0 PASS END=1110696;NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:DP:GQ:HQ 1|2:6:21:23,27 2|1:0:2:18,2 2/2:4:35:. +20 1230236 . T . 47.0 PASS END=1230237;NS=3;DP=13;AA=T GT:DP:GQ:HQ 0|0:7:54:56,60 0|0:4:48:51,51 0/0:2:61:. diff --git a/gcp_variant_transforms/testing/integration/bq_to_vcf_tests/option_number_of_bases_per_shard.json b/gcp_variant_transforms/testing/integration/bq_to_vcf_tests/option_number_of_bases_per_shard.json index d471b8adc..e6f386e6a 100644 --- a/gcp_variant_transforms/testing/integration/bq_to_vcf_tests/option_number_of_bases_per_shard.json +++ b/gcp_variant_transforms/testing/integration/bq_to_vcf_tests/option_number_of_bases_per_shard.json @@ -6,6 +6,6 @@ "number_of_bases_per_shard": 100000, "bq_uses_1_based_coordinate": false, "runner": "DataflowRunner", - "expected_output_file": "gs://gcp-variant-transforms-testfiles/bq_to_vcf_expected_output/platinum_NA12877_hg38_10K_lines_v2.vcf" + "expected_output_file": "gs://gcp-variant-transforms-testfiles/bq_to_vcf_expected_output/platinum_NA12877_hg38_10K_lines.vcf" } ] diff --git a/gcp_variant_transforms/testing/integration/preprocessor_tests/bgzf_preprocessor.json b/gcp_variant_transforms/testing/integration/preprocessor_tests/bgzf_preprocessor.json index 46fffbb86..91ee73be8 100644 --- a/gcp_variant_transforms/testing/integration/preprocessor_tests/bgzf_preprocessor.json +++ b/gcp_variant_transforms/testing/integration/preprocessor_tests/bgzf_preprocessor.json @@ -8,8 +8,8 @@ "expected_contents": [ "Header Conflicts", "ID\tCategory\tConflicts\tFile Paths\tProposed Resolution", - "GL\tFORMAT\tnum=3 type=Float\tgs://gcp-variant-transforms-testfiles/small_tests/bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz\tnum=. type=Float", - " \t \tnum=. type=Float\tgs://gcp-variant-transforms-testfiles/small_tests/bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.genome_strip_hq.20101123.svs.low_coverage.genotypes.vcf.gz\t ", + "GL\tFORMAT\tnum=. type=Float\tgs://gcp-variant-transforms-testfiles/small_tests/bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.genome_strip_hq.20101123.svs.low_coverage.genotypes.vcf.gz\tnum=. 
type=Float", + " \t \tnum=3 type=Float\tgs://gcp-variant-transforms-testfiles/small_tests/bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz\t ", "GQ\tFORMAT\tnum=1 type=Float\tgs://gcp-variant-transforms-testfiles/small_tests/bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.genome_strip_hq.20101123.svs.low_coverage.genotypes.vcf.gz\tnum=1 type=Float", " \t \tnum=1 type=Integer\tgs://gcp-variant-transforms-testfiles/small_tests/bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz\t ", "", diff --git a/gcp_variant_transforms/testing/integration/preprocessor_tests/non_splittable_bgzf_preprocessor.json b/gcp_variant_transforms/testing/integration/preprocessor_tests/non_splittable_bgzf_preprocessor.json index afcf58218..deeafd407 100644 --- a/gcp_variant_transforms/testing/integration/preprocessor_tests/non_splittable_bgzf_preprocessor.json +++ b/gcp_variant_transforms/testing/integration/preprocessor_tests/non_splittable_bgzf_preprocessor.json @@ -9,8 +9,8 @@ "expected_contents": [ "Header Conflicts", "ID\tCategory\tConflicts\tFile Paths\tProposed Resolution", - "GL\tFORMAT\tnum=3 type=Float\tgs://gcp-variant-transforms-testfiles/small_tests/non_splittable_bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz\tnum=. type=Float", - " \t \tnum=. type=Float\tgs://gcp-variant-transforms-testfiles/small_tests/non_splittable_bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.genome_strip_hq.20101123.svs.low_coverage.genotypes.vcf.gz\t ", + "GL\tFORMAT\tnum=. type=Float\tgs://gcp-variant-transforms-testfiles/small_tests/non_splittable_bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.genome_strip_hq.20101123.svs.low_coverage.genotypes.vcf.gz\tnum=. type=Float", + " \t \tnum=3 type=Float\tgs://gcp-variant-transforms-testfiles/small_tests/non_splittable_bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz\t ", "GQ\tFORMAT\tnum=1 type=Float\tgs://gcp-variant-transforms-testfiles/small_tests/non_splittable_bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.genome_strip_hq.20101123.svs.low_coverage.genotypes.vcf.gz\tnum=1 type=Float", " \t \tnum=1 type=Integer\tgs://gcp-variant-transforms-testfiles/small_tests/non_splittable_bgzf_preprocessor/1000-genomes_vcf_ALL.chrY.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz\t ", "", diff --git a/gcp_variant_transforms/testing/integration/run_bq_to_vcf_tests.py b/gcp_variant_transforms/testing/integration/run_bq_to_vcf_tests.py index 9e43a7db9..4431e2d33 100644 --- a/gcp_variant_transforms/testing/integration/run_bq_to_vcf_tests.py +++ b/gcp_variant_transforms/testing/integration/run_bq_to_vcf_tests.py @@ -71,7 +71,7 @@ def __init__(self, '--temp_location {}'.format(parsed_args.temp_location), '--job_name {}'.format( ''.join([test_name, timestamp]).replace('_', '-'))] - for k, v in kwargs.iteritems(): + for k, v in kwargs.items(): args.append('--{} {}'.format(k, v)) self.run_test_command = run_tests_common.form_command( @@ -140,7 +140,7 @@ def main(): if __name__ == '__main__': - print 'Starting bq_to_vcf tests...' + print('Starting bq_to_vcf tests...') ret_code = main() - print 'Finished all bq_to_vcf tests successfully.' 
+ print('Finished all bq_to_vcf tests successfully.') sys.exit(ret_code) diff --git a/gcp_variant_transforms/testing/integration/run_preprocessor_tests.py b/gcp_variant_transforms/testing/integration/run_preprocessor_tests.py index 99a24bee8..3992adf1f 100644 --- a/gcp_variant_transforms/testing/integration/run_preprocessor_tests.py +++ b/gcp_variant_transforms/testing/integration/run_preprocessor_tests.py @@ -82,7 +82,7 @@ def __init__(self, _BUCKET_NAME, self._header_blob_name]) args.append('--resolved_headers_path {}'.format(self._header_path)) - for k, v in kwargs.iteritems(): + for k, v in kwargs.items(): args.append('--{} {}'.format(k, v)) self.run_test_command = run_tests_common.form_command( @@ -107,7 +107,7 @@ def validate_result(self): raise run_tests_common.TestCaseFailure( 'Report is not generated in {} in test {}'.format(self._report_path, self._name)) - contents = report_blob.download_as_string() + contents = report_blob.download_as_string().decode('utf-8') expected_contents = '\n'.join(self._expected_contents) if expected_contents != contents: raise run_tests_common.TestCaseFailure( @@ -169,7 +169,7 @@ def main(): if __name__ == '__main__': - print 'Starting preprocessor tests...' + print('Starting preprocessor tests...') ret_code = main() - print 'Finished all preprocessor tests successfully.' + print('Finished all preprocessor tests successfully.') sys.exit(ret_code) diff --git a/gcp_variant_transforms/testing/integration/run_tests_common.py b/gcp_variant_transforms/testing/integration/run_tests_common.py index 09b72f048..0400703eb 100644 --- a/gcp_variant_transforms/testing/integration/run_tests_common.py +++ b/gcp_variant_transforms/testing/integration/run_tests_common.py @@ -36,7 +36,7 @@ ['running_test', 'remaining_tests']) -class TestCaseInterface(object): +class TestCaseInterface(): """Interface of an integration test case.""" def validate_result(self): @@ -46,10 +46,9 @@ def validate_result(self): class TestCaseFailure(Exception): """Exception for failed test cases.""" - pass -class TestRunner(object): +class TestRunner(): """Runs the tests using pipelines API.""" def __init__(self, tests, revalidate=False): @@ -92,13 +91,13 @@ def _run_test(self, test_cases): {test_cases[0].get_name(): subprocess.Popen( test_cases[0].run_test_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)}) - print 'Started executing: {}'.format(test_cases[0].get_name()) + print('Started executing: {}'.format(test_cases[0].get_name())) def _wait_for_all_operations_done(self): """Waits until all operations are done.""" while self._test_names_to_processes: time.sleep(10) - running_test_names = self._test_names_to_processes.keys() + running_test_names = list(self._test_names_to_processes.keys()) for test_name in running_test_names: running_proc = self._test_names_to_processes.get(test_name) return_code = running_proc.poll() @@ -106,14 +105,14 @@ def _wait_for_all_operations_done(self): test_case_state = self._test_names_to_test_states.get(test_name) self._handle_failure(running_proc, test_case_state.running_test) del self._test_names_to_processes[test_name] - print 'Started validating: {}'.format(test_name) + print('Started validating: {}'.format(test_name)) test_case_state.running_test.validate_result() self._run_test(test_case_state.remaining_tests) def _handle_failure(self, proc, test_case): """Raises errors if test case failed.""" if proc.returncode != 0: - print 'ERROR: Test execution failed: {}'.format(test_case.get_name()) + print('ERROR: Test execution failed: 
{}'.format(test_case.get_name())) stdout, stderr = proc.communicate() raise TestCaseFailure('Test case {} failed. stdout: {}, stderr: {}, ' 'return code: {}.'.format(test_case.get_name(), @@ -124,7 +123,7 @@ def print_results(self): """Prints results of test cases.""" for test_cases in self._tests: for test_case in test_cases: - print '{} ...ok'.format(test_case.get_name()) + print('{} ...ok'.format(test_case.get_name())) return 0 diff --git a/gcp_variant_transforms/testing/integration/run_vcf_to_bq_tests.py b/gcp_variant_transforms/testing/integration/run_vcf_to_bq_tests.py index 30668721c..297adc785 100644 --- a/gcp_variant_transforms/testing/integration/run_vcf_to_bq_tests.py +++ b/gcp_variant_transforms/testing/integration/run_vcf_to_bq_tests.py @@ -40,7 +40,7 @@ """ import argparse -from concurrent.futures import TimeoutError +from concurrent.futures import TimeoutError as ConcurrentTimeoutError import enum import os import sys @@ -84,9 +84,9 @@ def __init__(self, '--temp_location {}'.format(context.temp_location), '--job_name {}-{}'.format(test_name, self._dataset_id.replace('_', '-'))] - for k, v in kwargs.iteritems(): + for k, v in kwargs.items(): value = v - if isinstance(v, basestring): + if isinstance(v, str): value = v.format(TABLE_NAME=full_table_id) args.append('--{} {}'.format(k, value)) self.run_test_command = run_tests_common.form_command( @@ -110,7 +110,7 @@ def get_name(self): return self._name -class QueryAssertion(object): +class QueryAssertion(): """Runs a query and verifies that the output matches the expected result.""" def __init__(self, client, test_name, query, expected_result): @@ -126,8 +126,8 @@ def run_assertion(self): while True: try: results = query_job.result(timeout=300) - except TimeoutError as e: - print 'WARNING: Time out waiting for query: {}'.format(self._query) + except ConcurrentTimeoutError as e: + print('WARNING: Time out waiting for query: {}'.format(self._query)) if num_retries < _NUM_QUERY_RETIRES: num_retries += 1 time.sleep(90) @@ -136,23 +136,22 @@ def run_assertion(self): else: if results.total_rows == 1: break + print('ERROR: Query `{}` did not return expected num rows: {}'.format( + self._query, results.total_rows)) + if num_retries < _NUM_QUERY_RETIRES: + num_retries += 1 + time.sleep(90) else: - print 'ERROR: Query `{}` did not return expected num rows: {}'.format( - self._query, results.total_rows) - if num_retries < _NUM_QUERY_RETIRES: - num_retries += 1 - time.sleep(90) - else: - raise run_tests_common.TestCaseFailure( - 'Expected 1 row query results instead got {} in test {}'.format( - results.total_rows, self._test_name)) + raise run_tests_common.TestCaseFailure( + 'Expected 1 row query results instead got {} in test {}'.format( + results.total_rows, self._test_name)) row = list(results)[0] - col_names = row.keys() + col_names = list(row.keys()) if set(self._expected_result.keys()) != set(col_names): raise run_tests_common.TestCaseFailure( 'Expected {} columns in the query result, got {} in test {}'.format( - self._expected_result.keys(), col_names, self._test_name)) + list(self._expected_result.keys()), col_names, self._test_name)) for key in self._expected_result.keys(): if self._expected_result.get(key) != row.get(key): raise run_tests_common.TestCaseFailure( @@ -160,7 +159,7 @@ def run_assertion(self): key, self._expected_result[key], row.get(key), self._test_name)) -class QueryFormatter(object): +class QueryFormatter(): """Formats a query. Replaces macros and variables in the query. 
@@ -201,7 +200,7 @@ def _replace_macros(self, query): return query -class TestContextManager(object): +class TestContextManager(): """Manages all resources for a given run of tests. Responsible for setting up tests (i.e. creating a unique dataset) and @@ -335,7 +334,7 @@ def main(): if __name__ == '__main__': - print 'Starting vcf_to_bq tests...' + print('Starting vcf_to_bq tests...') ret_code = main() - print 'Finished all vcf_to_bq tests successfully.' + print('Finished all vcf_to_bq tests successfully.') sys.exit(ret_code) diff --git a/gcp_variant_transforms/testing/temp_dir.py b/gcp_variant_transforms/testing/temp_dir.py index d578398f5..6debd8609 100644 --- a/gcp_variant_transforms/testing/temp_dir.py +++ b/gcp_variant_transforms/testing/temp_dir.py @@ -14,7 +14,6 @@ """Utility functions and classes for testing.""" -from __future__ import absolute_import import bz2 import gzip @@ -27,7 +26,7 @@ __all__ = ['TempDir'] -class TempDir(object): +class TempDir(): """Context Manager to create and clean-up a temporary directory.""" def __init__(self): @@ -65,15 +64,16 @@ def create_temp_file( suffix=suffix) if not lines: return f.name + data_to_write = b''.join([line.encode('utf-8') for line in lines]) if compression_type in (filesystem.CompressionTypes.UNCOMPRESSED, filesystem.CompressionTypes.AUTO): - f.write(''.join(lines)) + f.write(data_to_write) elif compression_type == filesystem.CompressionTypes.GZIP: with gzip.GzipFile(f.name, 'w') as gzip_file: - gzip_file.write(''.join(lines)) + gzip_file.write(data_to_write) elif compression_type == filesystem.CompressionTypes.BZIP2: with bz2.BZ2File(f.name, 'w') as bzip_file: - bzip_file.write(''.join(lines)) + bzip_file.write(data_to_write) else: raise ValueError('Unsupported CompressionType.') diff --git a/gcp_variant_transforms/testing/testdata_util.py b/gcp_variant_transforms/testing/testdata_util.py index 88934eeb6..f8b08ca8f 100644 --- a/gcp_variant_transforms/testing/testdata_util.py +++ b/gcp_variant_transforms/testing/testdata_util.py @@ -14,7 +14,6 @@ """Util functions for accessing testdata.""" -from __future__ import absolute_import import os.path diff --git a/gcp_variant_transforms/testing/vcf_header_util.py b/gcp_variant_transforms/testing/vcf_header_util.py index b6a3461a8..173f9648c 100644 --- a/gcp_variant_transforms/testing/vcf_header_util.py +++ b/gcp_variant_transforms/testing/vcf_header_util.py @@ -14,7 +14,6 @@ """Utility functions for creating VcfHeader objects used by unit tests.""" -from __future__ import absolute_import from gcp_variant_transforms.beam_io import vcf_header_io @@ -27,7 +26,7 @@ def make_header(header_num_dict): header_num_dict: a dictionary mapping info keys to string num values. """ infos = {} - for k, v in header_num_dict.iteritems(): + for k, v in header_num_dict.items(): num_field_value = v if v in vcf_header_io.HEADER_SPECIAL_NUMBERS else int(v) infos[k] = vcf_header_io.CreateInfoField(k, num_field_value, '.', '') return vcf_header_io.VcfHeader(infos=infos) diff --git a/gcp_variant_transforms/transforms/annotate_files.py b/gcp_variant_transforms/transforms/annotate_files.py index b2bd9664e..cf489c191 100644 --- a/gcp_variant_transforms/transforms/annotate_files.py +++ b/gcp_variant_transforms/transforms/annotate_files.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
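The temp_dir.py hunk above and the annotate_files.py change just below (writing `b'Watchdog file.'`) both enforce the same Python 3 boundary: Beam's `FileSystems` streams, like `gzip`/`bz2` file objects, deal in bytes, so text must be encoded on the way out and decoded on the way in (as vcf_header_parser.py and pipeline_common.py now do with `.decode('utf-8')`). The `hashlib.md5(value.encode('utf-8'))` change in move_to_calls_strategy.py exists for the same reason, since Python 3 hash functions accept only bytes. A minimal sketch, with a hypothetical local path:

```python
from apache_beam.io import filesystems

path = '/tmp/watchdog_example.txt'

# FileSystems.create/open return binary streams: encode text before writing.
with filesystems.FileSystems.create(path) as f:
  f.write('Watchdog file.\n'.encode('utf-8'))

# ...and decode after reading.
with filesystems.FileSystems.open(path) as f:
  first_line = f.readline().decode('utf-8')
```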
-from __future__ import absolute_import import threading import time @@ -55,7 +54,7 @@ def process(self, input_pattern): t.start() while t.isAlive(): with filesystems.FileSystems.create(watchdog_file) as file_to_write: - file_to_write.write('Watchdog file.') + file_to_write.write(b'Watchdog file.') time.sleep(_WATCHDOG_FILE_UPDATE_INTERVAL_SECONDS) def _annotate_files(self, input_pattern, watchdog_file): diff --git a/gcp_variant_transforms/transforms/bigquery_to_variant_test.py b/gcp_variant_transforms/transforms/bigquery_to_variant_test.py index 73082f6cf..7dfb19705 100644 --- a/gcp_variant_transforms/transforms/bigquery_to_variant_test.py +++ b/gcp_variant_transforms/transforms/bigquery_to_variant_test.py @@ -31,35 +31,35 @@ class BigQueryToVariantTest(unittest.TestCase): """Test cases for `BigQueryToVariant` transform.""" def _get_bigquery_row_and_variant(self): - row = {unicode(ColumnKeyConstants.REFERENCE_NAME): unicode('chr19'), - unicode(ColumnKeyConstants.START_POSITION): 11, - unicode(ColumnKeyConstants.END_POSITION): 12, - unicode(ColumnKeyConstants.REFERENCE_BASES): 'C', - unicode(ColumnKeyConstants.NAMES): ['rs1', 'rs2'], - unicode(ColumnKeyConstants.QUALITY): 2, - unicode(ColumnKeyConstants.FILTER): ['PASS'], - unicode(ColumnKeyConstants.CALLS): [ - {unicode(ColumnKeyConstants.CALLS_SAMPLE_ID): ( + row = {str(ColumnKeyConstants.REFERENCE_NAME): str('chr19'), + str(ColumnKeyConstants.START_POSITION): 11, + str(ColumnKeyConstants.END_POSITION): 12, + str(ColumnKeyConstants.REFERENCE_BASES): 'C', + str(ColumnKeyConstants.NAMES): ['rs1', 'rs2'], + str(ColumnKeyConstants.QUALITY): 2, + str(ColumnKeyConstants.FILTER): ['PASS'], + str(ColumnKeyConstants.CALLS): [ + {str(ColumnKeyConstants.CALLS_SAMPLE_ID): ( hash_name('Sample1')), - unicode(ColumnKeyConstants.CALLS_GENOTYPE): [0, 1], - unicode(ColumnKeyConstants.CALLS_PHASESET): unicode('*'), - unicode('GQ'): 20, unicode('FIR'): [10, 20]}, - {unicode(ColumnKeyConstants.CALLS_SAMPLE_ID): ( + str(ColumnKeyConstants.CALLS_GENOTYPE): [0, 1], + str(ColumnKeyConstants.CALLS_PHASESET): str('*'), + str('GQ'): 20, str('FIR'): [10, 20]}, + {str(ColumnKeyConstants.CALLS_SAMPLE_ID): ( hash_name('Sample2')), - unicode(ColumnKeyConstants.CALLS_GENOTYPE): [1, 0], - unicode(ColumnKeyConstants.CALLS_PHASESET): None, - unicode('GQ'): 10, unicode('FB'): True} + str(ColumnKeyConstants.CALLS_GENOTYPE): [1, 0], + str(ColumnKeyConstants.CALLS_PHASESET): None, + str('GQ'): 10, str('FB'): True} ], - unicode(ColumnKeyConstants.ALTERNATE_BASES): [ - {unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): unicode('A'), - unicode('IFR'): None, - unicode('IFR2'): 0.2}, - {unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): unicode('TT'), - unicode('IFR'): 0.2, - unicode('IFR2'): 0.3} + str(ColumnKeyConstants.ALTERNATE_BASES): [ + {str(ColumnKeyConstants.ALTERNATE_BASES_ALT): str('A'), + str('IFR'): None, + str('IFR2'): 0.2}, + {str(ColumnKeyConstants.ALTERNATE_BASES_ALT): str('TT'), + str('IFR'): 0.2, + str('IFR2'): 0.3} ], - unicode('IS'): unicode('some data'), - unicode('ISR'): [unicode('data1'), unicode('data2')]} + str('IS'): str('some data'), + str('ISR'): [str('data1'), str('data2')]} variant = vcfio.Variant( reference_name='chr19', start=11, end=12, reference_bases='C', alternate_bases=['A', 'TT'], names=['rs1', 'rs2'], quality=2, diff --git a/gcp_variant_transforms/transforms/densify_variants.py b/gcp_variant_transforms/transforms/densify_variants.py index bd85e1368..1ba07d10a 100644 --- a/gcp_variant_transforms/transforms/densify_variants.py +++ 
b/gcp_variant_transforms/transforms/densify_variants.py @@ -14,7 +14,6 @@ """A PTransform to extend each Variant's calls with data for all samples.""" -from __future__ import absolute_import import apache_beam as beam @@ -55,7 +54,7 @@ def _densify_variants(self, variant, all_sample_ids): new_calls = [] for sample_id in all_sample_ids: - if sample_id in existing_sample_ids.keys(): + if sample_id in list(existing_sample_ids.keys()): new_calls.append(existing_sample_ids.get(sample_id)) else: new_calls.append( diff --git a/gcp_variant_transforms/transforms/densify_variants_test.py b/gcp_variant_transforms/transforms/densify_variants_test.py index b6d5d1b2e..c2b346691 100644 --- a/gcp_variant_transforms/transforms/densify_variants_test.py +++ b/gcp_variant_transforms/transforms/densify_variants_test.py @@ -14,7 +14,6 @@ """Tests for densify_variants module.""" -from __future__ import absolute_import import unittest diff --git a/gcp_variant_transforms/transforms/extract_input_size.py b/gcp_variant_transforms/transforms/extract_input_size.py index c26ae9b72..018840848 100644 --- a/gcp_variant_transforms/transforms/extract_input_size.py +++ b/gcp_variant_transforms/transforms/extract_input_size.py @@ -22,7 +22,6 @@ - file count: number of input files. """ -from __future__ import absolute_import import apache_beam as beam @@ -86,8 +85,9 @@ def print_estimates_to_file(variant_count, file_count, file_path): with filesystems.FileSystems.create(file_path) as file_to_write: - file_to_write.write('{}\n{}\n{}\n{}\n{}\n'.format(int(variant_count), - sample_count, - int(value_count), - files_size, - file_count)) + file_to_write.write(('{}\n{}\n{}\n{}\n{}\n'.format( + int(variant_count), + sample_count, + int(value_count), + files_size, + file_count)).encode('utf-8')) diff --git a/gcp_variant_transforms/transforms/filter_variants.py b/gcp_variant_transforms/transforms/filter_variants.py index 382f95cc0..c77a075b7 100644 --- a/gcp_variant_transforms/transforms/filter_variants.py +++ b/gcp_variant_transforms/transforms/filter_variants.py @@ -14,7 +14,6 @@ """A PTransform for filtering variants.""" -from __future__ import absolute_import from typing import Iterable # pylint: disable=unused-import import logging diff --git a/gcp_variant_transforms/transforms/filter_variants_test.py b/gcp_variant_transforms/transforms/filter_variants_test.py index b5e8de04e..1e9f2885f 100644 --- a/gcp_variant_transforms/transforms/filter_variants_test.py +++ b/gcp_variant_transforms/transforms/filter_variants_test.py @@ -14,7 +14,6 @@ """Tests for filter_variants module.""" -from __future__ import absolute_import import unittest diff --git a/gcp_variant_transforms/transforms/infer_headers.py b/gcp_variant_transforms/transforms/infer_headers.py index 55732d4ce..672608ee0 100644 --- a/gcp_variant_transforms/transforms/infer_headers.py +++ b/gcp_variant_transforms/transforms/infer_headers.py @@ -14,7 +14,6 @@ """A PTransform to infer undefined/mismatched header fields.""" -from __future__ import absolute_import from typing import Iterable, List, Optional # pylint: disable=unused-import diff --git a/gcp_variant_transforms/transforms/infer_headers_test.py b/gcp_variant_transforms/transforms/infer_headers_test.py index f68c8dcf6..8b9189039 100644 --- a/gcp_variant_transforms/transforms/infer_headers_test.py +++ b/gcp_variant_transforms/transforms/infer_headers_test.py @@ -14,7 +14,6 @@ """Tests for infer_headers module.""" -from __future__ import absolute_import from collections import OrderedDict import unittest @@ -304,7 +303,7 @@ 
def test_infer_annotation_types_no_conflicts(self): expected_types = {'CSQ_Gene_TYPE': 'String', 'CSQ_Position_TYPE': 'Integer', 'CSQ_Score_TYPE': 'Float'} - for key, item in inferred_headers.infos.iteritems(): + for key, item in inferred_headers.infos.items(): self.assertEqual(item['type'], expected_types[key]) self.assertEqual(len(expected_types), len(inferred_headers.infos)) @@ -321,7 +320,7 @@ def test_infer_annotation_types_with_type_conflicts(self): expected_types = {'CSQ_Gene_TYPE': 'Float', 'CSQ_Position_TYPE': 'String', 'CSQ_Score_TYPE': 'Float'} - for key, item in inferred_headers.infos.iteritems(): + for key, item in inferred_headers.infos.items(): self.assertEqual(item['type'], expected_types[key]) self.assertEqual(len(expected_types), len(inferred_headers.infos)) @@ -338,7 +337,7 @@ def test_infer_annotation_types_with_missing(self): expected_types = {'CSQ_Gene_TYPE': '.', 'CSQ_Position_TYPE': 'Integer', 'CSQ_Score_TYPE': 'Float'} - for key, item in inferred_headers.infos.iteritems(): + for key, item in inferred_headers.infos.items(): self.assertEqual(item['type'], expected_types[key]) self.assertEqual(len(expected_types), len(inferred_headers.infos)) @@ -370,7 +369,7 @@ def test_infer_annotation_types_with_multiple_annotation_fields(self): 'CSQ_VT_Gene_TYPE': 'Integer', 'CSQ_VT_Position_TYPE': 'Integer', 'CSQ_VT_Score_TYPE': 'Float'} - for key, item in inferred_headers.infos.iteritems(): + for key, item in inferred_headers.infos.items(): self.assertEqual(item['type'], expected_types[key]) self.assertEqual(len(expected_types), len(inferred_headers.infos)) diff --git a/gcp_variant_transforms/transforms/limit_write.py b/gcp_variant_transforms/transforms/limit_write.py index 0f910dc71..19102c9f2 100644 --- a/gcp_variant_transforms/transforms/limit_write.py +++ b/gcp_variant_transforms/transforms/limit_write.py @@ -21,7 +21,6 @@ are BigQuery rows) based on that key before writing them to output table. 
""" -from __future__ import absolute_import import random import apache_beam as beam diff --git a/gcp_variant_transforms/transforms/limit_write_test.py b/gcp_variant_transforms/transforms/limit_write_test.py index 980e1cc87..e01f95b35 100644 --- a/gcp_variant_transforms/transforms/limit_write_test.py +++ b/gcp_variant_transforms/transforms/limit_write_test.py @@ -14,7 +14,6 @@ """Tests for limit_write module.""" -from __future__ import absolute_import import unittest diff --git a/gcp_variant_transforms/transforms/merge_header_definitions.py b/gcp_variant_transforms/transforms/merge_header_definitions.py index f926fb103..67d3c7342 100644 --- a/gcp_variant_transforms/transforms/merge_header_definitions.py +++ b/gcp_variant_transforms/transforms/merge_header_definitions.py @@ -29,7 +29,7 @@ class _MergeDefinitionsFn(beam.CombineFn): def __init__(self, definitions_merger): # type: (vcf_header_definitions_merger.DefinitionsMerger) -> None - super(_MergeDefinitionsFn, self).__init__() + super().__init__() self._definitions_merger = definitions_merger def create_accumulator(self): @@ -66,7 +66,7 @@ class MergeDefinitions(beam.PTransform): def __init__(self): """Initializes `MergeDefinitions` object.""" - super(MergeDefinitions, self).__init__() + super().__init__() self._definitions_merger = vcf_header_definitions_merger.DefinitionsMerger() def expand(self, pcoll): diff --git a/gcp_variant_transforms/transforms/merge_headers.py b/gcp_variant_transforms/transforms/merge_headers.py index cef158d5d..ed6257270 100644 --- a/gcp_variant_transforms/transforms/merge_headers.py +++ b/gcp_variant_transforms/transforms/merge_headers.py @@ -29,7 +29,7 @@ class _MergeHeadersFn(beam.CombineFn): def __init__(self, merger): # type: (HeaderMerger) -> None - super(_MergeHeadersFn, self).__init__() + super().__init__() self._header_merger = merger def create_accumulator(self): @@ -73,7 +73,7 @@ def __init__(self, allow_incompatible_records: If true, header definition with type mismatch (e.g., string vs float) are always resolved. """ - super(MergeHeaders, self).__init__() + super().__init__() # Resolver makes extra efforts to resolve conflict in header definitions # when flag allow_incompatible_records is set. For example, it resolves # type conflict of string and float into string. 
diff --git a/gcp_variant_transforms/transforms/merge_headers_test.py b/gcp_variant_transforms/transforms/merge_headers_test.py index c8540ef77..2505c30cf 100644 --- a/gcp_variant_transforms/transforms/merge_headers_test.py +++ b/gcp_variant_transforms/transforms/merge_headers_test.py @@ -73,8 +73,8 @@ def test_combine_single_header(self): merged_headers = combiner_fn.add_input(merged_headers, headers) merged_headers = combiner_fn.extract_output(merged_headers) - self.assertItemsEqual(merged_headers.infos.keys(), ['NS', 'AF']) - self.assertItemsEqual(merged_headers.formats.keys(), ['GT', 'GQ']) + self.assertCountEqual(list(merged_headers.infos.keys()), ['NS', 'AF']) + self.assertCountEqual(list(merged_headers.formats.keys()), ['GT', 'GQ']) def test_combine_multiple_headers_as_inputs(self): headers_1 = self._get_header_from_lines(FILE_1_LINES) @@ -87,8 +87,10 @@ def test_combine_multiple_headers_as_inputs(self): merged_headers = combiner_fn.add_input(merged_headers, headers_2) merged_headers = combiner_fn.extract_output(merged_headers) - self.assertItemsEqual(merged_headers.infos.keys(), ['NS', 'AF', 'NS2']) - self.assertItemsEqual(merged_headers.formats.keys(), ['GT', 'GQ', 'GQ2']) + self.assertCountEqual(list(merged_headers.infos.keys()), + ['NS', 'AF', 'NS2']) + self.assertCountEqual(list(merged_headers.formats.keys()), + ['GT', 'GQ', 'GQ2']) def test_combine_multiple_headers_as_accumulators(self): headers_1 = self._get_header_from_lines(FILE_1_LINES) @@ -104,8 +106,10 @@ def test_combine_multiple_headers_as_accumulators(self): merged_headers_2]) merged_headers = combiner_fn.extract_output(merged_headers) - self.assertItemsEqual(merged_headers.infos.keys(), ['NS', 'AF', 'NS2']) - self.assertItemsEqual(merged_headers.formats.keys(), ['GT', 'GQ', 'GQ2']) + self.assertCountEqual(list(merged_headers.infos.keys()), + ['NS', 'AF', 'NS2']) + self.assertCountEqual(list(merged_headers.formats.keys()), + ['GT', 'GQ', 'GQ2']) def test_combine_two_type_conflicting_but_resolvable_headers(self): # These two headers have type conflict (Integer vs Float), however pipeline @@ -129,8 +133,8 @@ def test_combine_two_type_conflicting_but_resolvable_headers(self): merged_headers = combiner_fn.add_input(merged_headers, headers_2) merged_headers = combiner_fn.extract_output(merged_headers) - self.assertItemsEqual(merged_headers.infos.keys(), ['NS']) - self.assertItemsEqual(merged_headers.infos['NS'], + self.assertCountEqual(list(merged_headers.infos.keys()), ['NS']) + self.assertCountEqual(merged_headers.infos['NS'], OrderedDict([('id', 'NS'), ('num', 1), ('type', 'Float'), @@ -153,8 +157,8 @@ def test_none_type_defaults_to_string(self): merged_headers = combiner_fn.add_input(merged_headers, headers) merged_headers = combiner_fn.extract_output(merged_headers) - self.assertItemsEqual(merged_headers.infos.keys(), ['NS']) - self.assertItemsEqual(merged_headers.infos['NS'], + self.assertCountEqual(list(merged_headers.infos.keys()), ['NS']) + self.assertCountEqual(merged_headers.infos['NS'], OrderedDict([('id', 'NS'), ('num', 1), ('type', 'String'), @@ -184,8 +188,8 @@ def test_combine_two_num_conflicting_but_resolvable_headers_1(self): merged_headers = combiner_fn.add_input(merged_headers, headers_2) merged_headers = combiner_fn.extract_output(merged_headers) - self.assertItemsEqual(merged_headers.infos.keys(), ['NS']) - self.assertItemsEqual(merged_headers.infos['NS'], + self.assertCountEqual(list(merged_headers.infos.keys()), ['NS']) + self.assertCountEqual(merged_headers.infos['NS'], OrderedDict([('id', 'NS'), 
('num', '.'), ('type', 'Integer'), @@ -215,8 +219,8 @@ def test_combine_two_num_conflicting_but_resolvable_headers_2(self): merged_headers = combiner_fn.add_input(merged_headers, headers_2) merged_headers = combiner_fn.extract_output(merged_headers) - self.assertItemsEqual(merged_headers.infos.keys(), ['NS']) - self.assertItemsEqual(merged_headers.infos['NS'], + self.assertCountEqual(list(merged_headers.infos.keys()), ['NS']) + self.assertCountEqual(merged_headers.infos['NS'], OrderedDict([('id', 'NS'), ('num', '.'), ('type', 'Integer'), @@ -250,8 +254,8 @@ def test_combine_two_num_conflicting_but_resolvable_headers_3(self): merged_headers = combiner_fn.add_input(merged_headers, headers_2) merged_headers = combiner_fn.extract_output(merged_headers) - self.assertItemsEqual(merged_headers.infos.keys(), ['NS']) - self.assertItemsEqual(merged_headers.infos['NS'], + self.assertCountEqual(list(merged_headers.infos.keys()), ['NS']) + self.assertCountEqual(merged_headers.infos['NS'], OrderedDict([('id', 'NS'), ('num', '.'), ('type', 'Integer'), diff --git a/gcp_variant_transforms/transforms/merge_variants.py b/gcp_variant_transforms/transforms/merge_variants.py index 14a47daa5..b58f3b5f4 100644 --- a/gcp_variant_transforms/transforms/merge_variants.py +++ b/gcp_variant_transforms/transforms/merge_variants.py @@ -14,7 +14,6 @@ """A PTransform for merging variants based on a strategy.""" -from __future__ import absolute_import import apache_beam as beam @@ -42,7 +41,8 @@ def _map_by_variant_keys(self, variant): for key in self._variant_merger.get_merge_keys(variant): yield (key, variant) - def _merge_variants_by_key(self, (key, variants)): + def _merge_variants_by_key(self, key_and_variants): + (key, variants) = key_and_variants return self._variant_merger.get_merged_variants(variants, key) def expand(self, pcoll): diff --git a/gcp_variant_transforms/transforms/merge_variants_test.py b/gcp_variant_transforms/transforms/merge_variants_test.py index f8235a751..bde5262f6 100644 --- a/gcp_variant_transforms/transforms/merge_variants_test.py +++ b/gcp_variant_transforms/transforms/merge_variants_test.py @@ -14,7 +14,6 @@ """Tests for merge_variants module.""" -from __future__ import absolute_import import unittest diff --git a/gcp_variant_transforms/transforms/sample_info_to_avro.py b/gcp_variant_transforms/transforms/sample_info_to_avro.py index ff0977c27..855db6881 100644 --- a/gcp_variant_transforms/transforms/sample_info_to_avro.py +++ b/gcp_variant_transforms/transforms/sample_info_to_avro.py @@ -17,7 +17,7 @@ import time import apache_beam as beam -import avro +import fastavro from gcp_variant_transforms.beam_io import vcf_header_io # pylint: disable=unused-import from gcp_variant_transforms.beam_io import vcf_parser @@ -38,7 +38,7 @@ def __init__(self, sample_name_encoding): self._sample_name_encoding = sample_name_encoding def _get_now_to_minute(self): - return int(time.time()) / _SECS_IN_MIN * _SECS_IN_MIN * _MICROS_IN_SEC + return int(time.time()) // _SECS_IN_MIN * _SECS_IN_MIN * _MICROS_IN_SEC def process(self, vcf_header): # type: (vcf_header_io.VcfHeader, bool) -> Dict[str, Union[int, str]] @@ -74,12 +74,12 @@ def __init__(self, output_path, sample_name_encoding): self._output_path = output_path self._sample_name_encoding = sample_name_encoding bq_schema = sample_info_table_schema_generator.generate_schema() - self._avro_schema = avro.schema.parse( - schema_converter.convert_table_schema_to_json_avro_schema(bq_schema)) + self._fastavro_schema = fastavro.parse_schema( + 
schema_converter.convert_schema_to_avro_dict(bq_schema)) def expand(self, pcoll): return (pcoll | 'ConvertSampleInfoToAvroTableRow' >> beam.ParDo( ConvertSampleInfoToRow(self._sample_name_encoding)) | 'WriteToAvroFiles' >> beam.io.WriteToAvro( - self._output_path, self._avro_schema)) + self._output_path, self._fastavro_schema)) diff --git a/gcp_variant_transforms/transforms/sample_mapping_table_test.py b/gcp_variant_transforms/transforms/sample_mapping_table_test.py index 5d2d0f242..5361a20e1 100644 --- a/gcp_variant_transforms/transforms/sample_mapping_table_test.py +++ b/gcp_variant_transforms/transforms/sample_mapping_table_test.py @@ -14,7 +14,6 @@ """Tests for densify_variants module.""" -from __future__ import absolute_import import unittest diff --git a/gcp_variant_transforms/transforms/shard_variants.py b/gcp_variant_transforms/transforms/shard_variants.py index dd0835fd3..9ca3e1666 100644 --- a/gcp_variant_transforms/transforms/shard_variants.py +++ b/gcp_variant_transforms/transforms/shard_variants.py @@ -14,7 +14,6 @@ """A PTransform for sharding variants based on their reference_name.""" -from __future__ import absolute_import import apache_beam as beam from gcp_variant_transforms.beam_io import vcfio # pylint: disable=unused-import diff --git a/gcp_variant_transforms/transforms/shard_variants_test.py b/gcp_variant_transforms/transforms/shard_variants_test.py index 63bacbc9c..0e1275483 100644 --- a/gcp_variant_transforms/transforms/shard_variants_test.py +++ b/gcp_variant_transforms/transforms/shard_variants_test.py @@ -14,7 +14,6 @@ """Tests for shard_variants module.""" -from __future__ import absolute_import import unittest diff --git a/gcp_variant_transforms/transforms/variant_to_avro.py b/gcp_variant_transforms/transforms/variant_to_avro.py index 7a6b8f0db..e98795147 100644 --- a/gcp_variant_transforms/transforms/variant_to_avro.py +++ b/gcp_variant_transforms/transforms/variant_to_avro.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import absolute_import import apache_beam as beam -import avro +import fastavro from gcp_variant_transforms.beam_io import vcf_header_io # pylint: disable=unused-import from gcp_variant_transforms.libs import bigquery_row_generator @@ -59,8 +58,8 @@ def __init__( to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT. 
""" self._output_path = output_path - self._avro_schema = avro.schema.parse( - schema_converter.convert_table_schema_to_json_avro_schema(schema)) + self._fastavro_schema = fastavro.parse_schema( + schema_converter.convert_schema_to_avro_dict(schema)) self._bigquery_row_generator = ( bigquery_row_generator.VariantCallRowGenerator( bigquery_schema_descriptor.SchemaDescriptor(schema), @@ -79,4 +78,4 @@ def expand(self, pcoll): self._omit_empty_sample_calls)) return (avro_records | 'WriteToAvroFiles' >> - beam.io.WriteToAvro(self._output_path, self._avro_schema)) + beam.io.WriteToAvro(self._output_path, self._fastavro_schema)) diff --git a/gcp_variant_transforms/transforms/variant_to_bigquery.py b/gcp_variant_transforms/transforms/variant_to_bigquery.py index fe8e7aaa3..cf90b83ee 100644 --- a/gcp_variant_transforms/transforms/variant_to_bigquery.py +++ b/gcp_variant_transforms/transforms/variant_to_bigquery.py @@ -14,7 +14,6 @@ """A PTransform to output a PCollection of ``Variant`` records to BigQuery.""" -from __future__ import absolute_import import random from typing import Dict, List # pylint: disable=unused-import @@ -48,7 +47,7 @@ def __init__( omit_empty_sample_calls=False # type: bool ): # type: (...) -> None - super(ConvertVariantToRow, self).__init__() + super().__init__() self._allow_incompatible_records = allow_incompatible_records self._omit_empty_sample_calls = omit_empty_sample_calls self._bigquery_row_generator = row_generator diff --git a/gcp_variant_transforms/transforms/variant_to_bigquery_test.py b/gcp_variant_transforms/transforms/variant_to_bigquery_test.py index c066b1b19..64af09c2d 100644 --- a/gcp_variant_transforms/transforms/variant_to_bigquery_test.py +++ b/gcp_variant_transforms/transforms/variant_to_bigquery_test.py @@ -14,7 +14,6 @@ """Tests for variant_to_bigquery module.""" -from __future__ import absolute_import import unittest diff --git a/gcp_variant_transforms/transforms/write_variants_to_shards.py b/gcp_variant_transforms/transforms/write_variants_to_shards.py index 9cb5ce294..dd979cda1 100644 --- a/gcp_variant_transforms/transforms/write_variants_to_shards.py +++ b/gcp_variant_transforms/transforms/write_variants_to_shards.py @@ -59,7 +59,7 @@ def process(self, variant, sample_names): # type: (vcfio.Variant, List[str]) -> None self._counter += 1 self._sample_names = sample_names - self._variant_lines.append(self._coder.encode(variant).strip('\n')) + self._variant_lines.append(self._coder.encode(variant).strip(b'\n')) if self._counter == self._number_of_variants_per_shard: self._write_variant_lines_to_vcf_shard(self._variant_lines) self._counter = 0 @@ -71,12 +71,13 @@ def _write_variant_lines_to_vcf_shard(self, variant_lines): vcf_fixed_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] str_sample_names = [str(sample_name) for sample_name in self._sample_names] - vcf_header = str('\t'.join(vcf_fixed_columns + str_sample_names)) + vcf_header = str('\t'.join(vcf_fixed_columns + str_sample_names)).encode( + 'utf-8') vcf_data_file = self._generate_unique_file_path(len(variant_lines)) with filesystems.FileSystems.create(vcf_data_file) as file_to_write: file_to_write.write(vcf_header) for v in variant_lines: - file_to_write.write('\n') + file_to_write.write(b'\n') file_to_write.write(v) def _generate_unique_file_path(self, variants_num): diff --git a/gcp_variant_transforms/transforms/write_variants_to_shards_test.py b/gcp_variant_transforms/transforms/write_variants_to_shards_test.py index d36695625..241105f86 100644 --- 
a/gcp_variant_transforms/transforms/write_variants_to_shards_test.py +++ b/gcp_variant_transforms/transforms/write_variants_to_shards_test.py @@ -50,17 +50,17 @@ def test_write_to_shards(self): shards_writter = write_variants_to_shards._WriteVariantsToVCFShards( tempdir.get_path(), 3) variants = self._get_variants() - variant_lines = [shards_writter._coder.encode(v).strip('\n') + variant_lines = [shards_writter._coder.encode(v).strip(b'\n') for v in variants] shards_writter._write_variant_lines_to_vcf_shard(variant_lines) expected_content = [ '\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', - 'INFO', 'FORMAT\n']), + 'INFO', 'FORMAT\n']).encode('utf-8'), '\t'.join(['19', '12', 'rs1', 'C', 'A,TT', '2', 'PASS', - 'A1=some data;A2=data1,data2', '.\n']), + 'A1=some data;A2=data1,data2', '.\n']).encode('utf-8'), '\t'.join(['19', '12', 'rs1', 'C', 'A,TT', '20', 'q10', - 'A1=some data2;A3=data3,data4', '.'])] + 'A1=some data2;A3=data3,data4', '.']).encode('utf-8')] file_paths = [] for dirpath, _, filenames in os.walk(tempdir.get_path()): diff --git a/gcp_variant_transforms/vcf_to_bq.py b/gcp_variant_transforms/vcf_to_bq.py index 328f747b4..6db353da4 100644 --- a/gcp_variant_transforms/vcf_to_bq.py +++ b/gcp_variant_transforms/vcf_to_bq.py @@ -31,7 +31,6 @@ --runner DataflowRunner """ -from __future__ import absolute_import import argparse # pylint: disable=unused-import from datetime import datetime @@ -385,7 +384,8 @@ def _validate_annotation_pipeline_args(known_args, pipeline_args): flags_dict = pipeline_options.PipelineOptions(pipeline_args).get_all_options() expected_flags = ['max_num_workers', 'num_workers'] for flag in expected_flags: - if flag in flags_dict and flags_dict[flag] > 0: + if (flag in flags_dict and + flags_dict[flag] is not None and flags_dict[flag] > 0): return raise ValueError('Could not find any of {} with a valid value among pipeline ' 'flags {}'.format(expected_flags, flags_dict)) @@ -416,10 +416,10 @@ def _write_schema_to_temp_file(schema, path): schema_json = schema_converter.convert_table_schema_to_json_bq_schema(schema) schema_file = tempfile.mkstemp(suffix=_BQ_SCHEMA_FILE_SUFFIX)[1] with filesystems.FileSystems.create(schema_file) as file_to_write: - file_to_write.write(schema_json) + file_to_write.write(schema_json.encode('utf-8')) gs_file_path = path + _BQ_SCHEMA_FILE_SUFFIX with filesystems.FileSystems.create(gs_file_path) as file_to_write: - file_to_write.write(schema_json) + file_to_write.write(schema_json.encode('utf-8')) return schema_file diff --git a/gcp_variant_transforms/vcf_to_bq_preprocess.py b/gcp_variant_transforms/vcf_to_bq_preprocess.py index c225f4e70..d3625fec2 100644 --- a/gcp_variant_transforms/vcf_to_bq_preprocess.py +++ b/gcp_variant_transforms/vcf_to_bq_preprocess.py @@ -46,7 +46,6 @@ --setup_file ./setup.py """ -from __future__ import absolute_import import logging import sys diff --git a/gcp_variant_transforms/vcf_to_bq_test.py b/gcp_variant_transforms/vcf_to_bq_test.py index deb7a8e7c..efa888abb 100644 --- a/gcp_variant_transforms/vcf_to_bq_test.py +++ b/gcp_variant_transforms/vcf_to_bq_test.py @@ -27,7 +27,8 @@ class VcfToBqTest(unittest.TestCase): """Tests cases for the ``vcf_to_bq`` script.""" def _create_mock_args(self, **args): - return collections.namedtuple('MockArgs', args.keys())(*args.values()) + return collections.namedtuple( + 'MockArgs', list(args.keys()))(*list(args.values())) def test_no_merge_strategy(self): args = self._create_mock_args( @@ -58,13 +59,13 @@ def test_valid_merge_strategy(self): def 
test_invalid_annotation_output_directory_raises_error(self): known_args = self._create_mock_args(annotation_output_dir='./*') pipeline_args = [] - with self.assertRaisesRegexp(ValueError, 'directory .* already exists'): + with self.assertRaisesRegex(ValueError, 'directory .* already exists'): vcf_to_bq._validate_annotation_pipeline_args(known_args, pipeline_args) def test_invalid_annotation_missing_flags_raises_error(self): known_args = self._create_mock_args(annotation_output_dir='dummy') pipeline_args = [] - with self.assertRaisesRegexp(ValueError, 'Could not .* pipeline flags'): + with self.assertRaisesRegex(ValueError, 'Could not .* pipeline flags'): vcf_to_bq._validate_annotation_pipeline_args(known_args, pipeline_args) def test_valid_annotation_flags(self): diff --git a/setup.py b/setup.py index c19a40ed3..4a3b82f4d 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ 'zlib1g-dev'] ] -PYSAM_INSTALLATION_COMMAND = ['pip', 'install', 'pysam<0.16.0'] +PYSAM_INSTALLATION_COMMAND = ['python3', '-m', 'pip', 'install', 'pysam<0.16.0'] REQUIRED_PACKAGES = [ 'cython>=0.28.1', @@ -38,7 +38,7 @@ 'google-api-python-client>=1.6,<1.7.12', 'intervaltree>=2.1.0,<2.2.0', 'mmh3<2.6', - 'google-cloud-storage', + 'google-cloud-storage<1.30.0', 'pyfarmhash', 'pyyaml' ] @@ -57,11 +57,12 @@ def finalize_options(self): pass def RunCustomCommand(self, command_list): - print 'Running command: %s' % command_list + print('Running command: %s' % command_list) try: subprocess.call(command_list) except Exception as e: - raise RuntimeError('Command %s failed with error: %s' % (command_list, e)) + raise RuntimeError( + 'Command %s failed with error: %s' % (command_list, e)) from e def run(self): try: @@ -71,11 +72,12 @@ def run(self): self.RunCustomCommand(command) self.RunCustomCommand(PYSAM_INSTALLATION_COMMAND) - except RuntimeError: + except RuntimeError as e: raise RuntimeError( 'PySam installation has failed. Make sure you have the ' + \ 'following packages installed: autoconf automake gcc libbz2-dev ' + \ - 'liblzma-dev libcurl4-openssl-dev libssl-dev make perl zlib1g-dev') + 'liblzma-dev libcurl4-openssl-dev libssl-dev make perl ' + \ + 'zlib1g-dev') from e class build(_build): # pylint: disable=invalid-name """A build command class that will be invoked during package install. @@ -104,8 +106,8 @@ class build(_build): # pylint: disable=invalid-name 'Topic :: Scientific/Engineering :: Information Analysis', 'Topic :: System :: Distributed Computing', 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7', ], setup_requires=REQUIRED_SETUP_PACKAGES,
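Several smaller Python 3 corrections ride along in the hunks above: `_get_now_to_minute` in `sample_info_to_avro.py` switches to `//` because `/` now performs true division and would turn the timestamp arithmetic into a float; the tests move from the removed `assertItemsEqual`/`assertRaisesRegexp` aliases to `assertCountEqual`/`assertRaisesRegex`; and the Avro sinks swap `avro.schema.parse` for `fastavro.parse_schema`, which, as the renamed helper `convert_schema_to_avro_dict` suggests, consumes the schema as a Python dict rather than a JSON string. The `setup.py` hunk additionally adopts explicit exception chaining: `raise ... from e` records the original failure as `__cause__` instead of discarding it when the `except` block rewraps it. A small sketch of that effect, substituting `subprocess.check_call` (which also raises on a nonzero exit, unlike the `subprocess.call` used in the hunk) and a deliberately nonexistent binary for the real pysam install command:

```python
import subprocess

def run_custom_command(command_list):
    # Rewrap the low-level failure as RunCustomCommand does, but chain it so
    # the traceback keeps the root cause under "The above exception was the
    # direct cause of the following exception".
    try:
        subprocess.check_call(command_list)
    except Exception as e:
        raise RuntimeError(
            'Command %s failed with error: %s' % (command_list, e)) from e

try:
    run_custom_command(['definitely-not-a-real-binary'])  # hypothetical command
except RuntimeError as err:
    # The chained original exception survives for debugging.
    assert isinstance(err.__cause__, FileNotFoundError)
```

Without `from e`, Python 3 would still attach the original exception as implicit context, but chaining makes the causal relationship explicit in the traceback instead of merely noting that another exception occurred during handling.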