diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bf6c55..5723201 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Version 0.9.0 + +- Added support to parse .RData/.rda files. +- Bump the version of the rds2cpp library. + ## Version 0.8.0 - 0.8.1 - Implement parsers for compressed list objects. diff --git a/README.md b/README.md index 44c3f66..74f419a 100644 --- a/README.md +++ b/README.md @@ -4,18 +4,7 @@ # rds2py -Parse and construct Python representations for datasets stored in RDS files. `rds2py` supports various base classes from R, and Bioconductor's `SummarizedExperiment` and `SingleCellExperiment` S4 classes. **_For more details, check out [rds2cpp library](https://github.com/LTLA/rds2cpp)._** - ---- - -**Version 0.5.0** brings major changes to the package, - -- Complete overhaul of the codebase using pybind11 -- Streamlined readers for R data types -- Updated API for all classes and methods - -Please refer to the [documentation](https://biocpy.github.io/rds2py/) for the latest usage guidelines. Previous versions may have incompatible APIs. - +Parse and construct Python representations for datasets stored in **RDS or RData** files. `rds2py` supports various base classes from R, and Bioconductor's `SummarizedExperiment` and `SingleCellExperiment` S4 classes. **_For more details, check out [rds2cpp library](https://github.com/LTLA/rds2cpp)._** ## Installation @@ -32,36 +21,29 @@ By default, the package does not install packages to convert python representati ## Usage -If you do not have an RDS object handy, feel free to download one from [single-cell-test-files](https://github.com/jkanche/random-test-files/releases). +> [!NOTE] +> +> If you do not have an RDS object handy, feel free to download one from [single-cell-test-files](https://github.com/jkanche/random-test-files/releases). 
```python -from rds2py import read_rds -r_obj = read_rds("path/to/file.rds") +from rds2py import read_rds, read_rda +r_obj = read_rds("path/to/file.rds") # or read_rda("path/to/file.rda") ``` The returned `r_obj` either returns an appropriate Python class if a parser is already implemented or returns the dictionary containing the data from the RDS file. -To just get the parsed dictionary representation of the RDS file, - -```python -from rds2py import parse_rds - -robject_dict = parse_rds("path/to/file.rds") -print(robject_dict) -``` - ### Write-your-own-reader -Reading RDS files as dictionary representations allows users to write their own custom readers into appropriate Python representations. +Reading RDS or RData files as dictionary representations allows users to write their own custom readers into appropriate Python representations. ```python -from rds2py import parse_rds +from rds2py import parse_rds, parse_rda -robject = parse_rds("path/to/file.rds") +robject = parse_rds("path/to/file.rds") # or use parse_rda for RData files print(robject) ``` -if you know this RDS file contains an `GenomicRanges` object, you can use the built-in reader or write your own reader to convert this dictionary. +If you know this RDS file contains a `GenomicRanges` object, you can use the built-in reader or write your own reader to convert this dictionary. 
```python from rds2py.read_granges import read_genomic_ranges diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 96a7421..14470b7 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -11,16 +11,23 @@ include(FetchContent) FetchContent_Declare( rds2cpp GIT_REPOSITORY https://github.com/LTLA/rds2cpp - GIT_TAG v1.1.0 + GIT_TAG master ) FetchContent_Declare( byteme GIT_REPOSITORY https://github.com/LTLA/byteme - GIT_TAG v1.2.2 + GIT_TAG master +) + +FetchContent_Declare( + sanisizer + GIT_REPOSITORY https://github.com/LTLA/sanisizer + GIT_TAG master ) FetchContent_MakeAvailable(byteme) +FetchContent_MakeAvailable(sanisizer) FetchContent_MakeAvailable(rds2cpp) # Defining the targets. diff --git a/lib/src/rdswrapper.cpp b/lib/src/rdswrapper.cpp index 1b52da2..2ed6aa2 100644 --- a/lib/src/rdswrapper.cpp +++ b/lib/src/rdswrapper.cpp @@ -148,7 +148,8 @@ class RdsObject { public: RdsObject(const std::string& file) { try { - parsed = std::make_unique(rds2cpp::parse_rds(file)); + rds2cpp::ParseRdsOptions options; + parsed = std::make_unique(rds2cpp::parse_rds(file, options)); if (!parsed || !parsed->object) { throw std::runtime_error("Failed to parse RDS file"); } @@ -164,6 +165,60 @@ class RdsObject { } }; +class RdaObject { /* Holds a parsed RData file and exposes its top-level objects by position or by tag name. */ +private: + std::unique_ptr parsed; /* result of rds2cpp::parse_rda; the readers returned below are built from raw pointers into it */ + +public: + RdaObject(const std::string& file) { /* Parses `file` with default-constructed ParseRdaOptions; a std::exception raised while parsing is rethrown as std::runtime_error with a prefixed message. */ + try { + rds2cpp::ParseRdaOptions options; + parsed = std::make_unique(rds2cpp::parse_rda(file, options)); + } catch (const std::exception& e) { + throw std::runtime_error(std::string("Error in 'RdaObject' constructor: ") + e.what()); + } + } + + py::list get_object_names() const { /* Returns one entry per tag_names slot: the tag name where has_tag is set, otherwise None. */ + if (!parsed) throw std::runtime_error("Null parsed in 'get_object_names'"); + const auto& pairlist = parsed->contents; + py::list names; + for (size_t i = 0; i < pairlist.tag_names.size(); ++i) { + if (pairlist.has_tag[i]) { + names.append(pairlist.tag_names[i]); + } else { + names.append(py::none()); + } + } + return names; + } + + int get_object_count() const { /* Returns the size of contents.data as an int. */ + if 
(!parsed) throw std::runtime_error("Null parsed in 'get_object_count'"); + return static_cast(parsed->contents.data.size()); + } + + RdsReader* get_object_by_index(int index) const { /* Returns a new RdsReader for contents.data[index]; throws std::out_of_range when index is negative or past the end. The raw pointer is handed to the caller (the bindings below register it with take_ownership and keep_alive). */ + if (!parsed) throw std::runtime_error("Null parsed in 'get_object_by_index'"); + const auto& data = parsed->contents.data; + if (index < 0 || static_cast(index) >= data.size()) { + throw std::out_of_range("Object index out of range"); + } + return new RdsReader(data[index].get()); + } + + RdsReader* get_object_by_name(const std::string& name) const { /* Returns a new RdsReader for the first tagged entry whose name equals `name`; throws std::runtime_error when no entry matches. */ + if (!parsed) throw std::runtime_error("Null parsed in 'get_object_by_name'"); + const auto& pairlist = parsed->contents; + for (size_t i = 0; i < pairlist.tag_names.size(); ++i) { + if (pairlist.has_tag[i] && pairlist.tag_names[i] == name) { + return new RdsReader(pairlist.data[i].get()); + } + } + throw std::runtime_error("Object not found: " + name); + } +}; + PYBIND11_MODULE(lib_rds_parser, m) { py::register_exception(m, "RdsParserError"); @@ -171,6 +226,13 @@ PYBIND11_MODULE(lib_rds_parser, m) { .def(py::init()) .def("get_robject", &RdsObject::get_robject, py::return_value_policy::reference_internal); + py::class_(m, "RdaObject") + .def(py::init()) + .def("get_object_names", &RdaObject::get_object_names) + .def("get_object_count", &RdaObject::get_object_count) + .def("get_object_by_index", &RdaObject::get_object_by_index, py::return_value_policy::take_ownership, py::keep_alive<0, 1>()) + .def("get_object_by_name", &RdaObject::get_object_by_name, py::return_value_policy::take_ownership, py::keep_alive<0, 1>()); + py::class_(m, "RdsReader") .def(py::init()) .def("get_rtype", &RdsReader::get_rtype) diff --git a/setup.cfg b/setup.cfg index b853bae..c7dbc2a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -62,7 +62,6 @@ exclude = # `pip install rds2py[PDF]` like: # PDF = ReportLab; RXP optional = - pandas hdf5array scipy biocframe @@ -72,6 +71,7 @@ optional = multiassayexperiment>=0.6.0 compressed_lists>=0.4.4 biocutils>=0.3.4 + 
compressed_lists # Add here test requirements (semicolon/line-separated) testing = diff --git a/src/rds2py/PyRdaReader.py b/src/rds2py/PyRdaReader.py new file mode 100644 index 0000000..ad8d266 --- /dev/null +++ b/src/rds2py/PyRdaReader.py @@ -0,0 +1,111 @@ +"""Low-level interface for reading RData files. + +This module provides the core functionality for parsing RData (.RData/.rda) files +and converting them into dictionary representations that can be further processed +by higher-level functions. +""" + +from typing import Any, Dict + +from .lib_rds_parser import RdaObject, RdsReader +from .PyRdsReader import PyRdsParser + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +class PyRdaParserError(Exception): + """Exception raised for errors during RData parsing.""" + + pass + + +class PyRdaParser: + """Parser for reading RData files. + + This class provides low-level access to RData file contents, handling the binary + format and converting it into Python data structures. It reuses the same + ``RdsReader``-based object processing from :py:class:`~.PyRdsParser`. + + Attributes: + rda_object: + Internal representation of the RData file. + """ + + def __init__(self, file_path: str): + """Initialize the parser. + + Args: + file_path: + Path to the RData file to be read. + """ + try: + self.rda_object = RdaObject(file_path) + except Exception as e: + raise PyRdaParserError(f"Error initializing 'PyRdaParser': {str(e)}") + + def get_object_names(self): + """Get the names of all objects stored in the RData file. + + Returns: + A list of object names (strings). + """ + return list(self.rda_object.get_object_names()) + + def get_object_count(self) -> int: + """Get the number of objects stored in the RData file. + + Returns: + Number of objects. + """ + return self.rda_object.get_object_count() + + def parse(self) -> Dict[str, Dict[str, Any]]: + """Parse all objects in the RData file. 
+ + Returns: + A dictionary mapping object names to their parsed representations. + Each value has the same structure as the output of + :py:meth:`~rds2py.PyRdsReader.PyRdsParser.parse`. + """ + try: + helper = _RdsProcessorHelper() + + result = {} + names = self.get_object_names() + for i, name in enumerate(names): + reader = self.rda_object.get_object_by_index(i) + key = name if name is not None else f"__unnamed_{i}" + result[key] = helper._process_object(reader) + + return result + except Exception as e: + raise PyRdaParserError(f"Error parsing RData file: {str(e)}") + + def parse_object(self, name: str) -> Dict[str, Any]: + """Parse a single named object from the RData file. + + Args: + name: + Name of the object to parse. + + Returns: + A dictionary containing the parsed data for the requested object. + """ + try: + helper = _RdsProcessorHelper() + reader = self.rda_object.get_object_by_name(name) + return helper._process_object(reader) + except Exception as e: + raise PyRdaParserError(f"Error parsing object '{name}': {str(e)}") + + +class _RdsProcessorHelper(PyRdsParser): + """Helper that reuses PyRdsParser's object processing without requiring a file.""" + + def __init__(self): + self.R_MIN = -2147483648 + + def _process_object(self, obj: RdsReader) -> Dict[str, Any]: + return super()._process_object(obj) diff --git a/src/rds2py/__init__.py b/src/rds2py/__init__.py index f64e9e8..58dee0f 100644 --- a/src/rds2py/__init__.py +++ b/src/rds2py/__init__.py @@ -15,5 +15,6 @@ finally: del version, PackageNotFoundError -from .generics import read_rds -from .rdsutils import parse_rds + +from .generics import read_rds, read_rda +from .rdsutils import parse_rds, parse_rda diff --git a/src/rds2py/generics.py b/src/rds2py/generics.py index 50501e5..a448f26 100644 --- a/src/rds2py/generics.py +++ b/src/rds2py/generics.py @@ -16,9 +16,10 @@ """ from importlib import import_module +from typing import List, Optional from warnings import warn -from .rdsutils import get_class, 
parse_rds +from .rdsutils import get_class, parse_rda, parse_rds __author__ = "jkanche" __copyright__ = "jkanche" @@ -105,6 +106,34 @@ def read_rds(path: str, **kwargs): return _dispatcher(_robj, **kwargs) +def read_rda(path: str, objects: Optional[List[str]] = None, **kwargs) -> dict: + """Read an RData file and convert each object to an appropriate Python type. + + This function parses all (or selected) objects and dispatches each one + through the same type registry used by :py:func:`~.read_rds`. + + Args: + path: + Path to the RData (.RData/.rda) file to be read. + + objects: + Optional list of object names to read. If ``None``, + all objects in the file are read. + + **kwargs: + Additional arguments passed to specific parser functions. + + Returns: + A dictionary mapping object names to their converted Python + representations. + """ + parsed = parse_rda(path=path, objects=objects) + result = {} + for name, robj in parsed.items(): + result[name] = _dispatcher(robj, **kwargs) + return result + + def _dispatcher(robject: dict, **kwargs): """Internal function to dispatch R objects to appropriate parser functions. diff --git a/src/rds2py/rdsutils.py b/src/rds2py/rdsutils.py index 82f52a3..2359b43 100644 --- a/src/rds2py/rdsutils.py +++ b/src/rds2py/rdsutils.py @@ -1,9 +1,12 @@ -"""Utility functions for RDS file parsing and class inference. +"""Utility functions for RDS/RData file parsing and class inference. -This module provides helper functions for parsing RDS files and inferring the appropriate R class information from -parsed objects. +This module provides helper functions for parsing RDS and RData files and inferring the appropriate R class +information from parsed objects. 
""" +from typing import Dict, List, Optional + +from .PyRdaReader import PyRdaParser from .PyRdsReader import PyRdsParser __author__ = "jkanche" @@ -28,6 +31,32 @@ def parse_rds(path: str) -> dict: return realized +def parse_rda(path: str, objects: Optional[List[str]] = None) -> Dict[str, dict]: + """Parse an RData file into a dictionary of named objects. + + Args: + path: + Path to the RData (.RData/.rda) file to be parsed. + + objects: + Optional list of object names to parse. If ``None``, + all objects in the file are parsed. + + Returns: + A dictionary mapping object names to their parsed representations. + Each value has the same structure as the output of :py:func:`~.parse_rds`. + """ + parser = PyRdaParser(path) + + if objects is None: + return parser.parse() + + result = {} + for name in objects: + result[name] = parser.parse_object(name) + return result + + def get_class(robj: dict) -> str: """Infer the R class name from a parsed RDS object. diff --git a/tests/data/dataframe.RData b/tests/data/dataframe.RData new file mode 100644 index 0000000..58ea284 Binary files /dev/null and b/tests/data/dataframe.RData differ diff --git a/tests/data/generate_rdata.R b/tests/data/generate_rdata.R new file mode 100644 index 0000000..7bcf489 --- /dev/null +++ b/tests/data/generate_rdata.R @@ -0,0 +1,36 @@ +set.seed(42) + +# Simple RData with basic types +int_vec <- as.integer(c(1, 2, 3, 4, 5)) +dbl_vec <- c(1.1, 2.2, 3.3, 4.4, 5.5) +str_vec <- c("hello", "world", "foo") +bool_vec <- c(TRUE, FALSE, TRUE, TRUE, FALSE) +save(int_vec, dbl_vec, str_vec, bool_vec, file = "simple.RData") + +# Single object RData +single_obj <- as.integer(1:10) +save(single_obj, file = "single_object.RData") + +# RData with a data.frame +test_df <- data.frame( + x = 1:5, + y = c(1.1, 2.2, 3.3, 4.4, 5.5), + z = c("a", "b", "c", "d", "e"), + stringsAsFactors = FALSE +) +save(test_df, file = "dataframe.RData") + +# RData with a list +test_list <- list(a = 1:3, b = c("x", "y"), c = TRUE) 
+save(test_list, file = "list.RData") + +# RData with a matrix +test_matrix <- matrix(1:12, nrow = 3, ncol = 4) +save(test_matrix, file = "matrix.RData") + +# RData with multiple mixed types +nums <- c(10.0, 20.0, 30.0) +chars <- LETTERS[1:5] +ints <- as.integer(c(100, 200, 300)) +nested_list <- list(alpha = 1:3, beta = c("a", "b")) +save(nums, chars, ints, nested_list, file = "mixed.RData") diff --git a/tests/data/list.RData b/tests/data/list.RData new file mode 100644 index 0000000..ffdfaf9 Binary files /dev/null and b/tests/data/list.RData differ diff --git a/tests/data/matrix.RData b/tests/data/matrix.RData new file mode 100644 index 0000000..c777284 Binary files /dev/null and b/tests/data/matrix.RData differ diff --git a/tests/data/mixed.RData b/tests/data/mixed.RData new file mode 100644 index 0000000..80aef0f Binary files /dev/null and b/tests/data/mixed.RData differ diff --git a/tests/data/simple.RData b/tests/data/simple.RData new file mode 100644 index 0000000..b931716 Binary files /dev/null and b/tests/data/simple.RData differ diff --git a/tests/data/single_object.RData b/tests/data/single_object.RData new file mode 100644 index 0000000..acebc56 Binary files /dev/null and b/tests/data/single_object.RData differ diff --git a/tests/test_rdata.py b/tests/test_rdata.py new file mode 100644 index 0000000..3077866 --- /dev/null +++ b/tests/test_rdata.py @@ -0,0 +1,129 @@ +import numpy as np +import pytest + +from rds2py import parse_rda, read_rda +from rds2py.PyRdaReader import PyRdaParser, PyRdaParserError + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +class TestPyRdaParser: + def test_object_names(self): + parser = PyRdaParser("tests/data/simple.RData") + names = parser.get_object_names() + assert set(names) == {"int_vec", "dbl_vec", "str_vec", "bool_vec"} + + def test_object_count(self): + parser = PyRdaParser("tests/data/simple.RData") + assert parser.get_object_count() == 4 + + def test_parse_all(self): + parser = 
PyRdaParser("tests/data/simple.RData") + result = parser.parse() + assert isinstance(result, dict) + assert set(result.keys()) == {"int_vec", "dbl_vec", "str_vec", "bool_vec"} + + def test_parse_single_object(self): + parser = PyRdaParser("tests/data/simple.RData") + obj = parser.parse_object("int_vec") + assert obj["type"] == "integer" + + def test_parse_missing_object(self): + parser = PyRdaParser("tests/data/simple.RData") + with pytest.raises(PyRdaParserError): + parser.parse_object("nonexistent") + + def test_invalid_file(self): + with pytest.raises(PyRdaParserError): + PyRdaParser("tests/data/nonexistent.RData") + + def test_single_object_file(self): + parser = PyRdaParser("tests/data/single_object.RData") + names = parser.get_object_names() + assert names == ["single_obj"] + assert parser.get_object_count() == 1 + + +class TestParseRda: + def test_parse_all_objects(self): + result = parse_rda("tests/data/simple.RData") + assert isinstance(result, dict) + assert "int_vec" in result + assert "dbl_vec" in result + assert "str_vec" in result + assert "bool_vec" in result + + def test_parse_selected_objects(self): + result = parse_rda("tests/data/simple.RData", objects=["int_vec", "dbl_vec"]) + assert set(result.keys()) == {"int_vec", "dbl_vec"} + + def test_integer_data(self): + result = parse_rda("tests/data/simple.RData", objects=["int_vec"]) + obj = result["int_vec"] + assert obj["type"] == "integer" + data = obj["data"] + np.testing.assert_array_equal(data, [1, 2, 3, 4, 5]) + + def test_double_data(self): + result = parse_rda("tests/data/simple.RData", objects=["dbl_vec"]) + obj = result["dbl_vec"] + assert obj["type"] == "double" + data = obj["data"] + np.testing.assert_array_almost_equal(data, [1.1, 2.2, 3.3, 4.4, 5.5]) + + def test_string_data(self): + result = parse_rda("tests/data/simple.RData", objects=["str_vec"]) + obj = result["str_vec"] + assert obj["type"] == "string" + assert list(obj["data"]) == ["hello", "world", "foo"] + + def 
test_boolean_data(self): + result = parse_rda("tests/data/simple.RData", objects=["bool_vec"]) + obj = result["bool_vec"] + assert obj["type"] == "boolean" + + def test_single_object_file(self): + result = parse_rda("tests/data/single_object.RData") + assert "single_obj" in result + obj = result["single_obj"] + assert obj["type"] == "integer" + + def test_mixed_types(self): + result = parse_rda("tests/data/mixed.RData") + assert "nums" in result + assert "chars" in result + assert "ints" in result + assert "nested_list" in result + + assert result["nums"]["type"] == "double" + assert result["chars"]["type"] == "string" + assert result["ints"]["type"] == "integer" + assert result["nested_list"]["type"] == "vector" + + +class TestReadRda: + def test_read_all(self): + result = read_rda("tests/data/simple.RData") + assert isinstance(result, dict) + assert len(result) == 4 + + def test_read_selected(self): + result = read_rda("tests/data/simple.RData", objects=["int_vec"]) + assert set(result.keys()) == {"int_vec"} + + def test_read_list(self): + result = read_rda("tests/data/list.RData") + assert "test_list" in result + + def test_read_mixed(self): + result = read_rda("tests/data/mixed.RData") + assert "nums" in result + assert "chars" in result + assert "ints" in result + assert "nested_list" in result + + def test_read_single_object(self): + result = read_rda("tests/data/single_object.RData") + assert "single_obj" in result