wip: GPU operations integrations with OpenCL

WrldEngine · WrldEngine · commit 294d3c56b9c6 · 2025-07-13T08:10:56.000+05:00
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -36,6 +36,11 @@ jobs:
           - runner: windows-latest
             target: x86
     steps:
+      - name: Installing OpenCL
+        # apt only exists on Linux runners; this job's matrix also lists windows-latest.
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y ocl-icd-opencl-dev
+
       - uses: actions/checkout@v4
       - name: Testing Rust modules
         run: |
@@ -156,6 +161,11 @@ jobs:
       contents: write
       attestations: write
     steps:
+      - name: Installing OpenCL
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y ocl-icd-opencl-dev
+
       - uses: actions/download-artifact@v4
       - name: Generate artifact attestation
         uses: actions/attest-build-provenance@v2
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rem_math"
-version = "0.1.7"
+version = "0.2.7"
 edition = "2021"
 
 [lib]
@@ -19,6 +19,7 @@ crate-type = ["cdylib", "rlib"]
 numpy = "0.25.0"
 pyo3 = { version = "0.25.1", features = ["extension-module"] }
 rayon = "1.10.0"
+ocl = "0.19"
 
 [dev-dependencies]
 criterion = "0.3"
diff --git a/benches/common.rs b/benches/common.rs
@@ -72,6 +72,13 @@ fn sum_two_ints32_with_mthreaded_benchmark(c: &mut Criterion) {
     );
 }
 
+fn sum_two_ints32_with_gpu_benchmark(c: &mut Criterion) {
+    let arr = black_box(vec![1; NUM_ITERATIONS]);
+    c.bench_function("Array accumulation of two integer arrays with GPU", |b| {
+        b.iter(|| sum_two_ints32(&arr, &arr, "gpu")) // must match the lowercase "gpu" match arm
+    });
+}
+
 fn mul_two_ints32_benchmark(c: &mut Criterion) {
     let arr = black_box(vec![1; NUM_ITERATIONS]);
     c.bench_function("Array multiply of two integer arrays", |b| {
@@ -104,6 +111,7 @@ criterion_group!(
     sum_two_ints32_with_benchmark,
     sum_two_ints32_with_simd_benchmark,
     sum_two_ints32_with_mthreaded_benchmark,
+    sum_two_ints32_with_gpu_benchmark,
     mul_two_ints32_benchmark,
     mul_two_ints32_with_simd_benchmark,
     mul_two_ints32_with_mthreaded_benchmark,
diff --git a/benches/compare_benchmark_test.py b/benches/compare_benchmark_test.py
@@ -3,7 +3,7 @@
 import pytest
 import time
 
-NUM_ITERATIONS = 10_000_000
+NUM_ITERATIONS = 100_000_000
 
 
 @pytest.fixture(scope="module")
@@ -67,7 +67,7 @@ def result():
     assert result is not None
 
 
-@pytest.mark.benchmark(
+""" @pytest.mark.benchmark(
     group="rm_mul",
     min_time=0.1,
     max_time=0.5,
@@ -81,7 +81,7 @@ def test_rm_mul(benchmark, large_array):
     def result():
         return rm.multiply_two_nparr_ints32(large_array, large_array, "threading")
 
-    assert result is not None
+    assert result is not None """
 
 
 @pytest.mark.benchmark(
@@ -116,3 +116,20 @@ def result():
         return rm.sum_two_nparr_ints32(large_array, large_array, "threading")
 
     assert result is not None
+
+
+@pytest.mark.benchmark(
+    group="rm_sum_two(gpu)",
+    min_time=0.1,
+    max_time=0.5,
+    min_rounds=5,
+    timer=time.time,
+    disable_gc=True,
+    warmup=False,
+)
+def test_rm_sum_two_ints32_gpu(benchmark, large_array):  # times the "gpu" method on large_array + large_array
+    @benchmark  # NOTE(review): presumably pytest-benchmark calls `result` here and rebinds it to the return value — confirm
+    def result():
+        return rm.sum_two_nparr_ints32(large_array, large_array, "gpu")
+
+    assert result is not None  # smoke check only; the summed values are not verified
diff --git a/build.rs b/build.rs
@@ -0,0 +1,34 @@
+//! Build script: emits the Cargo directives needed to link against the system OpenCL library.
+
+use std::env;
+
+/// Library directory used on Windows when `OPEN_CL_LIB_PATH` is not set.
+///
+/// This is only a convenience fallback for a default vcpkg checkout; set the variable instead.
+const DEFAULT_WINDOWS_LIB_DIR: &str = "C:\\Users\\user\\vcpkg\\installed\\x64-windows\\lib";
+
+/// Prints the `cargo:` link directives for OpenCL that match the compilation target.
+fn main() {
+    // Without these, Cargo re-runs the script on every source change and never notices a
+    // changed OPEN_CL_LIB_PATH.
+    println!("cargo:rerun-if-changed=build.rs");
+    println!("cargo:rerun-if-env-changed=OPEN_CL_LIB_PATH");
+
+    // `cfg!(windows)` would describe the host running this script; when cross-compiling the
+    // target can differ, so use the value Cargo exports for the target instead.
+    let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
+
+    match target_os.as_str() {
+        "windows" => {
+            // OpenCL.lib has to be installed before building; `vcpkg install opencl` is the
+            // recommended way to get it.
+            let lib_dir = env::var("OPEN_CL_LIB_PATH")
+                .unwrap_or_else(|_| DEFAULT_WINDOWS_LIB_DIR.to_owned());
+            println!("cargo:rustc-link-search=native={}", lib_dir);
+            println!("cargo:rustc-link-lib=dylib=OpenCL");
+        }
+        // macOS ships OpenCL as a framework, so there is no libOpenCL to link as a dylib.
+        "macos" => println!("cargo:rustc-link-lib=framework=OpenCL"),
+        _ => println!("cargo:rustc-link-lib=dylib=OpenCL"),
+    }
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "rem_math"
-version = "0.1.7"
+version = "0.2.7"
 description = ""
 authors = [
     {name = "WrldEngine",email = "kamran_pulatov@outlook.com"}
diff --git a/src/gpu.rs b/src/gpu.rs
@@ -0,0 +1,136 @@
+//! OpenCL implementations of the array operations (work in progress, will be refactored).
+//!
+//! Every call compiles the kernel source and sets up a fresh context and queue.
+
+extern crate ocl;
+use ocl::{Buffer, MemFlags, OclPrm, ProQue};
+
+const KERNEL_SRC: &str = include_str!("kernel.cl");
+
+/// Number of `f32` values one `dot_f` work item consumes (the kernel reads `float4`).
+const DOT_LANES: usize = 4;
+
+/// Compiles `KERNEL_SRC` and creates a queue whose default work size is `work_items`.
+fn build_pro_que(work_items: usize) -> ProQue {
+    ProQue::builder()
+        .src(KERNEL_SRC)
+        .dims(work_items)
+        .build()
+        .expect("failed to set up OpenCL (is a device and driver available?)")
+}
+
+/// Copies `host` into a new device buffer that kernels only read from.
+fn upload<T: OclPrm>(pro_que: &ProQue, host: &[T]) -> Buffer<T> {
+    Buffer::builder()
+        .queue(pro_que.queue().clone())
+        .flags(MemFlags::new().read_only())
+        .len(host.len())
+        .copy_host_slice(host)
+        .build()
+        .expect("failed to create an OpenCL input buffer")
+}
+
+/// Adds `arr_1` and `arr_2` element-wise on the GPU, widening every sum to `i64`.
+///
+/// `result_vec` is resized to `arr_1.len()` and overwritten with the sums.
+///
+/// # Panics
+///
+/// Panics if the slices differ in length or if any OpenCL call fails.
+pub fn sum_two_ints32(arr_1: &[i32], arr_2: &[i32], result_vec: &mut Vec<i64>) {
+    assert_eq!(
+        arr_1.len(),
+        arr_2.len(),
+        "sum_two_ints32: input slices must have the same length"
+    );
+
+    let len = arr_1.len();
+    result_vec.resize(len, 0);
+    if len == 0 {
+        // OpenCL rejects zero-sized buffers, and there is nothing to compute anyway.
+        return;
+    }
+
+    let pro_que = build_pro_que(len);
+    let buffer_1 = upload(&pro_que, arr_1);
+    let buffer_2 = upload(&pro_que, arr_2);
+    let result = pro_que
+        .create_buffer::<i64>()
+        .expect("failed to create the OpenCL result buffer");
+
+    let kernel = pro_que
+        .kernel_builder("add_i")
+        .arg(&buffer_1)
+        .arg(&buffer_2)
+        .arg(&result)
+        .build()
+        .expect("failed to build the `add_i` kernel");
+
+    // SAFETY: `add_i` only touches index `get_global_id(0)`, and the work size as well as all
+    // three buffers hold exactly `len` elements, so every device access is in bounds.
+    unsafe {
+        kernel.enq().expect("failed to enqueue the `add_i` kernel");
+    }
+
+    result
+        .read(result_vec)
+        .enq()
+        .expect("failed to read the `add_i` result back");
+}
+
+/// Computes one dot product per group of four consecutive elements on the GPU.
+///
+/// The `dot_f` kernel reads both inputs as `float4`, so output `i` is the dot product of
+/// elements `4 * i..4 * i + 4`. `result_vec` is resized to `arr_1.len() / 4`; summing it
+/// yields the dot product of the whole slices.
+///
+/// # Panics
+///
+/// Panics if the slices differ in length, if the length is not a multiple of four, or if
+/// any OpenCL call fails.
+pub fn dot_float(arr_1: &[f32], arr_2: &[f32], result_vec: &mut Vec<f32>) {
+    assert_eq!(
+        arr_1.len(),
+        arr_2.len(),
+        "dot_float: input slices must have the same length"
+    );
+    assert_eq!(
+        arr_1.len() % DOT_LANES,
+        0,
+        "dot_float: input length must be a multiple of {}",
+        DOT_LANES
+    );
+
+    let groups = arr_1.len() / DOT_LANES;
+    result_vec.resize(groups, 0.0);
+    if groups == 0 {
+        // OpenCL rejects zero-sized buffers, and there is nothing to compute anyway.
+        return;
+    }
+
+    let pro_que = build_pro_que(groups);
+    let buffer_1 = upload(&pro_que, arr_1);
+    let buffer_2 = upload(&pro_que, arr_2);
+    let result = pro_que
+        .create_buffer::<f32>()
+        .expect("failed to create the OpenCL result buffer");
+
+    let kernel = pro_que
+        .kernel_builder("dot_f")
+        .arg(&buffer_1)
+        .arg(&buffer_2)
+        .arg(&result)
+        .build()
+        .expect("failed to build the `dot_f` kernel");
+
+    // SAFETY: `dot_f` runs `groups` work items; each reads one `float4` from inputs that hold
+    // `groups * DOT_LANES` floats and writes one of the `groups` output elements.
+    unsafe {
+        kernel.enq().expect("failed to enqueue the `dot_f` kernel");
+    }
+
+    result
+        .read(result_vec)
+        .enq()
+        .expect("failed to read the `dot_f` result back");
+}
diff --git a/src/kernel.cl b/src/kernel.cl
@@ -0,0 +1,25 @@
+// OpenCL kernels embedded by src/gpu.rs. Every work item handles index get_global_id(0).
+
+// result[i] = buffer_1[i] + buffer_2[i] for floats.
+__kernel void add_f(__global const float* buffer_1, __global const float* buffer_2, __global float* result) {
+	const size_t idx = get_global_id(0);
+	result[idx] = buffer_1[idx] + buffer_2[idx];
+}
+
+// result[i] = buffer_1[i] + buffer_2[i]; operands are widened to long so the sum cannot overflow.
+__kernel void add_i(__global const int* buffer_1, __global const int* buffer_2, __global long* result) {
+	const size_t idx = get_global_id(0);
+	result[idx] = (long)buffer_1[idx] + (long)buffer_2[idx];
+}
+
+// result[i] = buffer_1[i] * buffer_2[i], computed in long so the product cannot overflow.
+__kernel void mul_i(__global const int* buffer_1, __global const int* buffer_2, __global long* result) {
+	const size_t idx = get_global_id(0);
+	result[idx] = (long)buffer_1[idx] * (long)buffer_2[idx];
+}
+
+// result[i] = dot(buffer_1[i], buffer_2[i]); each input element is a float4, i.e. four floats.
+__kernel void dot_f(__global const float4* buffer_1, __global const float4* buffer_2, __global float* result) {
+	const size_t idx = get_global_id(0);
+	result[idx] = dot(buffer_1[idx], buffer_2[idx]);
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,6 +1,7 @@
 use numpy::{Complex64, IntoPyArray, PyArray1, PyReadonlyArray1};
 use pyo3::{exceptions, prelude::*};
 
+pub mod gpu;
 pub mod native;
 
 #[pyfunction]
@@ -110,7 +111,6 @@ pub fn multiply_two_nparr_ints32<'py>(
     }
 
     let result = native::multiply_two_ints32(arr_1.as_slice()?, arr_2.as_slice()?, method);
-
     Ok(result.into_pyarray(_py))
 }
 
diff --git a/src/native.rs b/src/native.rs
@@ -2,6 +2,8 @@
 use core::arch::x86_64::*;
 use rayon::prelude::*;
 
+use crate::gpu;
+
 const WAY_8_SZ: usize = 8;
 const WAY_4_SZ: usize = 4;
 
@@ -204,6 +206,10 @@ pub fn sum_two_ints32(arr_1: &[i32], arr_2: &[i32], method: &str) -> Vec<i64> {
                 .collect_into_vec(&mut result);
             result
         }
+        "gpu" => {
+            gpu::sum_two_ints32(arr_1, arr_2, &mut result);
+            result
+        }
         &_ => {
             for ((arr_3_val, arr_1_val), arr_2_val) in
                 result.iter_mut().zip(arr_1.iter()).zip(arr_2.iter())
diff --git a/tests/test_arrays.rs b/tests/test_arrays.rs
@@ -28,6 +28,7 @@ fn test_sum_two_ints32() {
     assert_eq!(expected_arr, sum_two_ints32(&arr, &arr, ""));
     assert_eq!(expected_arr, sum_two_ints32(&arr, &arr, "simd"));
     assert_eq!(expected_arr, sum_two_ints32(&arr, &arr, "threading"));
+    assert_eq!(expected_arr, sum_two_ints32(&arr, &arr, "gpu"));
 }
 
 #[test]

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`use numpy::{Complex64, IntoPyArray, PyArray1, PyReadonlyArray1};`
`2`	`2`	`use pyo3::{exceptions, prelude::*};`
`3`	`3`
	`4`	`+pub mod gpu;`
`4`	`5`	`pub mod native;`
`5`	`6`
`6`	`7`	`#[pyfunction]`
`@@ -110,7 +111,6 @@ pub fn multiply_two_nparr_ints32<'py>(`
`110`	`111`	`}`
`111`	`112`
`112`	`113`	`let result = native::multiply_two_ints32(arr_1.as_slice()?, arr_2.as_slice()?, method);`
`113`		`-`
`114`	`114`	`Ok(result.into_pyarray(_py))`
`115`	`115`	`}`
`116`	`116`