Flexible BEDPE generation
The first step is to create a viola.Vcf object.
In [1]: import viola
In [2]: vcf = viola.read_vcf('https://raw.githubusercontent.com/dermasugita/ViolaDocs/main/docs/html/_static/tutorial.manta.vcf', patient_name='manta1')
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
Cell In [2], line 1
----> 1 vcf = viola.read_vcf('https://raw.githubusercontent.com/dermasugita/ViolaDocs/main/docs/html/_static/tutorial.manta.vcf', patient_name='manta1')
File ~/checkouts/readthedocs.org/user_builds/viola-sv/checkouts/latest/src/viola/io/parser.py:569, in read_vcf(filepath_or_buffer, variant_caller, patient_name)
567 # read vcf files using PyVcf package
568 if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
--> 569 b = StringIO(urllib.request.urlopen(filepath_or_buffer).read().decode('utf-8'))
570 vcf_reader = vcf.Reader(b)
571 elif isinstance(filepath_or_buffer, str):
File ~/.asdf/installs/python/3.9.13/lib/python3.9/urllib/request.py:214, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
212 else:
213 opener = _opener
--> 214 return opener.open(url, data, timeout)
File ~/.asdf/installs/python/3.9.13/lib/python3.9/urllib/request.py:523, in OpenerDirector.open(self, fullurl, data, timeout)
521 for processor in self.process_response.get(protocol, []):
522 meth = getattr(processor, meth_name)
--> 523 response = meth(req, response)
525 return response
File ~/.asdf/installs/python/3.9.13/lib/python3.9/urllib/request.py:632, in HTTPErrorProcessor.http_response(self, request, response)
629 # According to RFC 2616, "2xx" code indicates that the client's
630 # request was successfully received, understood, and accepted.
631 if not (200 <= code < 300):
--> 632 response = self.parent.error(
633 'http', request, response, code, msg, hdrs)
635 return response
File ~/.asdf/installs/python/3.9.13/lib/python3.9/urllib/request.py:561, in OpenerDirector.error(self, proto, *args)
559 if http_err:
560 args = (dict, 'default', 'http_error_default') + orig_args
--> 561 return self._call_chain(*args)
File ~/.asdf/installs/python/3.9.13/lib/python3.9/urllib/request.py:494, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
492 for handler in handlers:
493 func = getattr(handler, meth_name)
--> 494 result = func(*args)
495 if result is not None:
496 return result
File ~/.asdf/installs/python/3.9.13/lib/python3.9/urllib/request.py:641, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs)
640 def http_error_default(self, req, fp, code, msg, hdrs):
--> 641 raise HTTPError(req.full_url, code, msg, hdrs, fp)
HTTPError: HTTP Error 404: Not Found
Then use the to_bedpe_like method to generate a BEDPE-formatted pandas DataFrame.
In [3]: vcf_bedpe_like = vcf.to_bedpe_like()
In [4]: print(vcf_bedpe_like)
chrom1 start1 end1 chrom2 start2 end2 name score strand1 strand2
0 chr1 82550460 82550461 chr1 82554225 82554226 test1 None + -
1 chr1 22814216 22814217 chr1 92581131 92581132 test2 None - -
2 chr1 60567905 60567906 chr1 60675940 60675941 test3 None + -
3 chr1 69583189 69583190 chr1 69590947 69590948 test4 None + -
4 chr11 104534876 104534877 chr11 104536573 104536574 test5 None + -
5 chr11 111134696 111134697 chr17 26470494 26470495 test6_1 None + -
6 chr17 26470494 26470495 chr11 111134696 111134697 test6_2 None - +
If you want to add the “SVLEN” and “CIPOS” INFO fields as extra columns, pass their names via the custom_infonames argument:
In [5]: vcf_bedpe_like_with_info = vcf.to_bedpe_like(custom_infonames=['svlen', 'cipos'])
In [6]: print(vcf_bedpe_like_with_info)
chrom1 start1 end1 chrom2 start2 ... strand1 strand2 svlen_0 cipos_0 cipos_1
0 chr1 82550460 82550461 chr1 82554225 ... + - -3764 -51 52
1 chr1 22814216 22814217 chr1 92581131 ... - - 69766915 -51 51
2 chr1 60567905 60567906 chr1 60675940 ... + - -108034 -44 44
3 chr1 69583189 69583190 chr1 69590947 ... + - -7757 -123 123
4 chr11 104534876 104534877 chr11 104536573 ... + - -1696 -68 69
5 chr11 111134696 111134697 chr17 26470494 ... + - 0 -118 118
6 chr17 26470494 26470495 chr11 111134696 ... - + 0 -81 82
[7 rows x 13 columns]
To add the FORMAT fields as extra columns, set add_formats=True:
In [7]: vcf_bedpe_like_with_format = vcf.to_bedpe_like(add_formats=True)
In [8]: print(vcf_bedpe_like_with_format)
chrom1 start1 end1 chrom2 start2 ... sample1_N_SR_1 sample1_T_PR_0 sample1_T_PR_1 sample1_T_SR_0 sample1_T_SR_1
0 chr1 82550460 82550461 chr1 82554225 ... 0.0 43.0 4.0 15.0 3.0
1 chr1 22814216 22814217 chr1 92581131 ... NaN 35.0 5.0 NaN NaN
2 chr1 60567905 60567906 chr1 60675940 ... NaN 44.0 6.0 NaN NaN
3 chr1 69583189 69583190 chr1 69590947 ... NaN 20.0 12.0 NaN NaN
4 chr11 104534876 104534877 chr11 104536573 ... NaN 57.0 14.0 NaN NaN
5 chr11 111134696 111134697 chr17 26470494 ... NaN 45.0 5.0 NaN NaN
6 chr17 26470494 26470495 chr11 111134696 ... NaN 45.0 5.0 NaN NaN
[7 rows x 18 columns]
Similarly, to add the FILTER values as extra columns, set add_filters=True:
In [9]: vcf_bedpe_like_with_filter = vcf.to_bedpe_like(add_filters=True)
In [10]: print(vcf_bedpe_like_with_filter)
chrom1 start1 end1 chrom2 start2 ... score strand1 strand2 MinSomaticScore PASS
0 chr1 82550460 82550461 chr1 82554225 ... None + - True False
1 chr1 22814216 22814217 chr1 92581131 ... None - - True False
2 chr1 60567905 60567906 chr1 60675940 ... None + - True False
3 chr1 69583189 69583190 chr1 69590947 ... None + - False True
4 chr11 104534876 104534877 chr11 104536573 ... None + - False True
5 chr11 111134696 111134697 chr17 26470494 ... None + - True False
6 chr17 26470494 26470495 chr11 111134696 ... None - + True False
[7 rows x 12 columns]