Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

TwoBit.jl provides I/O and utilities for manipulating 2Bit sequence data files.

2Bit is a binary file format designed for storing a genome consists of multiple chromosomal sequences.
2Bit is a binary file format (described [here](https://genome.ucsc.edu/FAQ/FAQformat.html#format7)) designed for storing a genome that consists of multiple chromosomal sequences.

The reading speed is often an order of magnitude faster than that of FASTA and the file size is smaller.

Expand Down
16 changes: 9 additions & 7 deletions src/writer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@ mutable struct Writer{T<:IO} <: BioGenerics.IO.AbstractWriter
end

"""
TwoBitWriter(output::IO, names::AbstractVector)
TwoBitWriter(output::IO, names::AbstractVector; collapse_ambiguous = false)

Create a data writer of the 2bit file format.

# Arguments
* `output`: data sink
* `names`: a vector of sequence names written to `output`
* `collapse_ambiguous`: Set to `true` if your input sequence has partial ambiguities (W/S/R/Y, etc.) and you can
tolerate converting them to `N`. The format does not support partial ambiguities and will error otherwise.
"""
function Writer(output::IO, names::AbstractVector)
writer = Writer(output, names, falses(length(names)))
Expand Down Expand Up @@ -73,7 +75,7 @@ function update_offset(writer::Writer, seqname, seqoffset)
end
end

function Base.write(writer::Writer, record::WriteRecord)
function Base.write(writer::Writer, record::WriteRecord; collapse_ambiguous = false)
i = findfirst(isequal(record.name), writer.names)
if i == 0
error("sequence \"", record.name, "\" doesn't exist in the writing list")
Expand All @@ -86,7 +88,7 @@ function Base.write(writer::Writer, record::WriteRecord)

n = 0
n += write(output, UInt32(length(record.seq)))
n += write_n_blocks(output, record.seq)
n += write_n_blocks(output, record.seq; collapse_ambiguous)
n += write_masked_blocks(output, record.masks)
n += write(output, UInt32(0)) # reserved bytes
n += write_twobit_sequence(output, record.seq)
Expand All @@ -95,13 +97,13 @@ function Base.write(writer::Writer, record::WriteRecord)
return n
end

function make_n_blocks(seq)
function make_n_blocks(seq; collapse_ambiguous = false)
starts = UInt32[]
sizes = UInt32[]
i = 1
while i ≀ lastindex(seq)
nt = seq[i]
if nt == BioSequences.DNA_N
if nt == BioSequences.DNA_N || (collapse_ambiguous && BioSequences.isambiguous(nt))
start = i - 1 # 0-based index
push!(starts, start)
while i ≀ lastindex(seq) && seq[i] == BioSequences.DNA_N
Expand All @@ -117,8 +119,8 @@ function make_n_blocks(seq)
return starts, sizes
end

function write_n_blocks(output, seq)
blockstarts, blocksizes = make_n_blocks(seq)
function write_n_blocks(output, seq; collapse_ambiguous = false)
blockstarts, blocksizes = make_n_blocks(seq; collapse_ambiguous)
@assert length(blockstarts) == length(blocksizes)
n = 0
n += write(output, UInt32(length(blockstarts)))
Expand Down
Loading