Skip to content

Commit 294d3c5

Browse files
committed
wip: GPU operations integrations with OpenCL
1 parent 709381e commit 294d3c5

11 files changed

Lines changed: 168 additions & 6 deletions

File tree

.github/workflows/release.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ jobs:
3636
- runner: windows-latest
3737
target: x86
3838
steps:
39+
- name: Installing OpenCL
40+
run: |
41+
sudo apt update
42+
sudo apt install ocl-icd-opencl-dev
43+
3944
- uses: actions/checkout@v4
4045
- name: Testing Rust modules
4146
run: |
@@ -156,6 +161,11 @@ jobs:
156161
contents: write
157162
attestations: write
158163
steps:
164+
- name: Installing OpenCL
165+
run: |
166+
sudo apt update
167+
sudo apt install ocl-icd-opencl-dev
168+
159169
- uses: actions/download-artifact@v4
160170
- name: Generate artifact attestation
161171
uses: actions/attest-build-provenance@v2

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "rem_math"
3-
version = "0.1.7"
3+
version = "0.2.7"
44
edition = "2021"
55

66
[lib]
@@ -19,6 +19,7 @@ crate-type = ["cdylib", "rlib"]
1919
numpy = "0.25.0"
2020
pyo3 = { version = "0.25.1", features = ["extension-module"] }
2121
rayon = "1.10.0"
22+
ocl = "0.19"
2223

2324
[dev-dependencies]
2425
criterion = "0.3"

benches/common.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,13 @@ fn sum_two_ints32_with_mthreaded_benchmark(c: &mut Criterion) {
7272
);
7373
}
7474

75+
fn sum_two_ints32_with_gpu_benchmark(c: &mut Criterion) {
76+
let arr = black_box(vec![1; NUM_ITERATIONS]);
77+
c.bench_function("Array accumulation of two integer arrays with GPU", |b| {
78+
b.iter(|| sum_two_ints32(&arr, &arr, "GPU"))
79+
});
80+
}
81+
7582
fn mul_two_ints32_benchmark(c: &mut Criterion) {
7683
let arr = black_box(vec![1; NUM_ITERATIONS]);
7784
c.bench_function("Array multiply of two integer arrays", |b| {
@@ -104,6 +111,7 @@ criterion_group!(
104111
sum_two_ints32_with_benchmark,
105112
sum_two_ints32_with_simd_benchmark,
106113
sum_two_ints32_with_mthreaded_benchmark,
114+
sum_two_ints32_with_gpu_benchmark,
107115
mul_two_ints32_benchmark,
108116
mul_two_ints32_with_simd_benchmark,
109117
mul_two_ints32_with_mthreaded_benchmark,

benches/compare_benchmark_test.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import pytest
44
import time
55

6-
NUM_ITERATIONS = 10_000_000
6+
NUM_ITERATIONS = 100_000_000
77

88

99
@pytest.fixture(scope="module")
@@ -67,7 +67,7 @@ def result():
6767
assert result is not None
6868

6969

70-
@pytest.mark.benchmark(
70+
""" @pytest.mark.benchmark(
7171
group="rm_mul",
7272
min_time=0.1,
7373
max_time=0.5,
@@ -81,7 +81,7 @@ def test_rm_mul(benchmark, large_array):
8181
def result():
8282
return rm.multiply_two_nparr_ints32(large_array, large_array, "threading")
8383
84-
assert result is not None
84+
assert result is not None """
8585

8686

8787
@pytest.mark.benchmark(
@@ -116,3 +116,20 @@ def result():
116116
return rm.sum_two_nparr_ints32(large_array, large_array, "threading")
117117

118118
assert result is not None
119+
120+
121+
@pytest.mark.benchmark(
    group="rm_sum_two(gpu)",
    min_time=0.1,
    max_time=0.5,
    min_rounds=5,
    timer=time.time,
    disable_gc=True,
    warmup=False,
)
def test_rm_sum_two_ints32_gpu(benchmark, large_array):
    """Benchmark ``rm.sum_two_nparr_ints32`` with the ``"gpu"`` method.

    The array from the ``large_array`` fixture is used as both operands; the
    test only checks that the benchmarked call produced a value.
    """

    def run_gpu_sum():
        return rm.sum_two_nparr_ints32(large_array, large_array, "gpu")

    # Calling the fixture directly is what using it as a decorator does.
    outcome = benchmark(run_gpu_sum)

    assert outcome is not None

build.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
use std::env;
2+
3+
/// Build script entry point: emits the Cargo directives needed to link the
/// native OpenCL library.
///
/// On Windows targets the directory containing `OpenCL.lib` is taken from the
/// `OPEN_CL_LIB_PATH` environment variable, falling back to a vcpkg default.
fn main() {
    // Re-run the script when the library path override changes; otherwise a
    // corrected path would be ignored until something else triggers a rebuild.
    println!("cargo:rerun-if-env-changed=OPEN_CL_LIB_PATH");

    // A build script is compiled for the *host*, so `cfg!(windows)` would
    // describe the machine running the build. Cargo passes the *target* OS in
    // `CARGO_CFG_TARGET_OS`; fall back to the host OS when run outside Cargo.
    let target_os =
        env::var("CARGO_CFG_TARGET_OS").unwrap_or_else(|_| env::consts::OS.to_string());

    if target_os == "windows" {
        // Note: on Windows, `OpenCL.lib` (and its companion files) must be
        // installed before building. Installing through vcpkg is recommended:
        // run `vcpkg install opencl`.
        let open_cl_lib_path = env::var("OPEN_CL_LIB_PATH")
            .unwrap_or_else(|_| "C:\\Users\\user\\vcpkg\\installed\\x64-windows\\lib".into());
        println!("cargo:rustc-link-search=native={}", open_cl_lib_path);
    }

    if target_os == "macos" {
        // macOS ships OpenCL as a system framework rather than a `libOpenCL`
        // dynamic library, so it has to be linked as a framework.
        println!("cargo:rustc-link-lib=framework=OpenCL");
    } else {
        println!("cargo:rustc-link-lib=dylib=OpenCL");
    }
}

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "rem_math"
3-
version = "0.1.7"
3+
version = "0.2.7"
44
description = ""
55
authors = [
66
{name = "WrldEngine",email = "kamran_pulatov@outlook.com"}

src/gpu.rs

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// NOTE: Work in progress, will be refactored
2+
3+
extern crate ocl;
4+
use ocl::{Buffer, MemFlags, ProQue};
5+
6+
const KERNEL_SRC: &'static str = include_str!("kernel.cl");
7+
8+
pub fn sum_two_ints32(arr_1: &[i32], arr_2: &[i32], result_vec: &mut Vec<i64>) {
9+
let pro_que = ProQue::builder()
10+
.src(KERNEL_SRC)
11+
.dims(arr_1.len())
12+
.build()
13+
.unwrap();
14+
15+
let buffer_1 = Buffer::builder()
16+
.queue(pro_que.queue().clone())
17+
.flags(MemFlags::new().read_write())
18+
.len(arr_1.len())
19+
.copy_host_slice(&arr_1)
20+
.build()
21+
.unwrap();
22+
23+
let buffer_2 = Buffer::builder()
24+
.queue(pro_que.queue().clone())
25+
.flags(MemFlags::new().read_write())
26+
.len(arr_1.len())
27+
.copy_host_slice(&arr_2)
28+
.build()
29+
.unwrap();
30+
31+
let result = pro_que.create_buffer::<i64>().unwrap();
32+
33+
let kernel = pro_que
34+
.kernel_builder("add_i")
35+
.arg(&buffer_1)
36+
.arg(&buffer_2)
37+
.arg(&result)
38+
.build()
39+
.unwrap();
40+
41+
unsafe {
42+
kernel.enq().unwrap();
43+
}
44+
45+
result.read(result_vec).enq().unwrap();
46+
}
47+
48+
pub fn dot_float(arr_1: &[f32], arr_2: &[f32], result_vec: &mut Vec<f32>) {
49+
let pro_que = ProQue::builder()
50+
.src(KERNEL_SRC)
51+
.dims(arr_1.len())
52+
.build()
53+
.unwrap();
54+
55+
let buffer_1 = Buffer::builder()
56+
.queue(pro_que.queue().clone())
57+
.flags(MemFlags::new().read_write())
58+
.len(arr_1.len())
59+
.copy_host_slice(&arr_1)
60+
.build()
61+
.unwrap();
62+
63+
let buffer_2 = Buffer::builder()
64+
.queue(pro_que.queue().clone())
65+
.flags(MemFlags::new().read_write())
66+
.len(arr_1.len())
67+
.copy_host_slice(&arr_2)
68+
.build()
69+
.unwrap();
70+
71+
let result = pro_que.create_buffer::<f32>().unwrap();
72+
73+
let kernel = pro_que
74+
.kernel_builder("dot_f")
75+
.arg(&buffer_1)
76+
.arg(&buffer_2)
77+
.arg(&result)
78+
.build()
79+
.unwrap();
80+
81+
unsafe {
82+
kernel.enq().unwrap();
83+
}
84+
85+
result.read(result_vec).enq().unwrap();
86+
}

src/kernel.cl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// Element-wise float addition: each work-item adds one pair of elements and
// stores the sum at the same index in `result`.
__kernel void add_f(__global const float* buffer_1, __global const float* buffer_2, __global float* result) {
    const int gid = get_global_id(0);
    const float lhs = buffer_1[gid];
    const float rhs = buffer_2[gid];
    result[gid] = lhs + rhs;
}
5+
6+
// Element-wise integer addition: each work-item widens one pair of `int`
// elements to `long` before adding, then stores the sum in `result`.
__kernel void add_i(__global const int* buffer_1, __global const int* buffer_2, __global long* result) {
    const int gid = get_global_id(0);
    const long lhs = (long)buffer_1[gid];
    const long rhs = (long)buffer_2[gid];
    result[gid] = lhs + rhs;
}
10+
11+
// Element-wise integer multiplication: each work-item widens one pair of
// `int` elements to `long` and stores their product in `result`.
// Both operands are widened before multiplying so the product of two 32-bit
// values cannot overflow.
__kernel void mul_i(__global const int* buffer_1, __global const int* buffer_2, __global long* result) {
    int idx = get_global_id(0);
    result[idx] = (long)buffer_1[idx] * (long)buffer_2[idx];
}
15+
16+
// Per-vector dot product: each work-item loads one `float4` from each input
// and stores their dot product as a single float in `result`.
__kernel void dot_f(__global const float4* buffer_1, __global const float4* buffer_2, __global float* result) {
    const int gid = get_global_id(0);
    const float4 lhs = buffer_1[gid];
    const float4 rhs = buffer_2[gid];
    result[gid] = dot(lhs, rhs);
}

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use numpy::{Complex64, IntoPyArray, PyArray1, PyReadonlyArray1};
22
use pyo3::{exceptions, prelude::*};
33

4+
pub mod gpu;
45
pub mod native;
56

67
#[pyfunction]
@@ -110,7 +111,6 @@ pub fn multiply_two_nparr_ints32<'py>(
110111
}
111112

112113
let result = native::multiply_two_ints32(arr_1.as_slice()?, arr_2.as_slice()?, method);
113-
114114
Ok(result.into_pyarray(_py))
115115
}
116116

src/native.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
use core::arch::x86_64::*;
33
use rayon::prelude::*;
44

5+
use crate::gpu;
6+
57
const WAY_8_SZ: usize = 8;
68
const WAY_4_SZ: usize = 4;
79

@@ -204,6 +206,10 @@ pub fn sum_two_ints32(arr_1: &[i32], arr_2: &[i32], method: &str) -> Vec<i64> {
204206
.collect_into_vec(&mut result);
205207
result
206208
}
209+
"gpu" => {
210+
gpu::sum_two_ints32(arr_1, arr_2, &mut result);
211+
result
212+
}
207213
&_ => {
208214
for ((arr_3_val, arr_1_val), arr_2_val) in
209215
result.iter_mut().zip(arr_1.iter()).zip(arr_2.iter())

0 commit comments

Comments
 (0)