From cd99d0295952fa69decf1a148f307f7cfa8bc10f Mon Sep 17 00:00:00 2001 From: Nicholas Bauer Date: Mon, 30 Jun 2025 15:32:09 -0400 Subject: [PATCH 1/2] Add to docs too --- docs/src/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/index.md b/docs/src/index.md index 46126aa..3be8f24 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -10,7 +10,7 @@ TwoBit.jl provides I/O and utilities for manipulating 2Bit sequence data files. -2Bit is a binary file format designed for storing a genome consists of multiple chromosomal sequences. +2Bit is a binary file format (described [here](https://genome.ucsc.edu/FAQ/FAQformat.html#format7)) designed for storing a genome that consists of multiple chromosomal sequences. The reading speed is often an order of magnitude faster than that of FASTA and the file size is smaller. From 6434765f8845c5928250b34cc2172989ec429011 Mon Sep 17 00:00:00 2001 From: Nicholas Bauer Date: Mon, 30 Jun 2025 16:22:04 -0400 Subject: [PATCH 2/2] Add switch to allow partial ambiguity to be processed --- src/writer.jl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 33acf7e..6cff8d1 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -11,13 +11,15 @@ mutable struct Writer{T<:IO} <: BioGenerics.IO.AbstractWriter end """ - TwoBitWriter(output::IO, names::AbstractVector) + TwoBitWriter(output::IO, names::AbstractVector; collapse_ambiguous = false) Create a data writer of the 2bit file format. # Arguments * `output`: data sink * `names`: a vector of sequence names written to `output` +* `collapse_ambiguous`: Set to `true` if your input sequence has partial ambiguities (W/S/R/Y, etc.) and you can +tolerate converting them to `N`. The format does not support partial ambiguities and will error otherwise. """ function Writer(output::IO, names::AbstractVector) writer = Writer(output, names, falses(length(names))) @@ -73,7 +75,7 @@ function update_offset(writer::Writer, seqname, seqoffset) end end -function Base.write(writer::Writer, record::WriteRecord) +function Base.write(writer::Writer, record::WriteRecord; collapse_ambiguous = false) i = findfirst(isequal(record.name), writer.names) if i == 0 error("sequence \"", record.name, "\" doesn't exist in the writing list") @@ -86,7 +88,7 @@ function Base.write(writer::Writer, record::WriteRecord) n = 0 n += write(output, UInt32(length(record.seq))) - n += write_n_blocks(output, record.seq) + n += write_n_blocks(output, record.seq; collapse_ambiguous) n += write_masked_blocks(output, record.masks) n += write(output, UInt32(0)) # reserved bytes n += write_twobit_sequence(output, record.seq) @@ -95,13 +97,13 @@ function Base.write(writer::Writer, record::WriteRecord) return n end -function make_n_blocks(seq) +function make_n_blocks(seq; collapse_ambiguous = false) starts = UInt32[] sizes = UInt32[] i = 1 while i ≤ lastindex(seq) nt = seq[i] - if nt == BioSequences.DNA_N + if nt == BioSequences.DNA_N || (collapse_ambiguous && BioSequences.isambiguous(nt)) start = i - 1 # 0-based index push!(starts, start) while i ≤ lastindex(seq) && seq[i] == BioSequences.DNA_N @@ -117,8 +119,8 @@ function make_n_blocks(seq) return starts, sizes end -function write_n_blocks(output, seq) - blockstarts, blocksizes = make_n_blocks(seq) +function write_n_blocks(output, seq; collapse_ambiguous = false) + blockstarts, blocksizes = make_n_blocks(seq; collapse_ambiguous) @assert length(blockstarts) == length(blocksizes) n = 0 n += write(output, UInt32(length(blockstarts)))