Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 63 additions & 14 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,90 @@ variables:
GIT_SUBMODULE_STRATEGY: none
ROOT_DIR: '$CI_PROJECT_DIR'
APPS: "tests"
PATH: '/home/gitlabci/.cargo/bin:/usr/local/bin:/usr/bin:/usr/sbin:/sbin:/usr/local/condor/bin:/usr/sepp/bin:$CI_PROJECT_DIR/install/verilator/bin:/home/gitlabci/.local/bin'
PATH: '$HOME/.cargo/bin:/usr/local/bin:/usr/bin:/usr/sbin:/sbin:/usr/local/condor/bin:/usr/sepp/bin:$CI_PROJECT_DIR/install/verilator/bin:$HOME/.local/bin'
OBJCACHE: ''
CC: '/usr/pack/gcc-11.2.0-af/linux-x64/bin/gcc'
CXX: '/usr/pack/gcc-11.2.0-af/linux-x64/bin/g++'
CMAKE: 'cmake-3.28.3'
python: 'python3'
python3: 'python3'
# Config to build and test
CI_CONFIG: 'cachepool_fpu_512'
SW_PREFIX: 'test-cachepool-'

default:
tags: [dolent]
tags: [shared]

stages:
- build
- test

.base:
artifacts:
when: always
expire_in: 1 day

build-vsim:
extends: .base
# ---------------------------------------------------------------------------
# Build stage: compile RTL and software for CI_CONFIG.
# Parallel jobs within the same pipeline share $HOME, so the toolchain
# installed by make quick-tool is automatically available to all test jobs.
# ---------------------------------------------------------------------------
build:
stage: build
timeout: 5h
timeout: 4h 30m
script:
- echo "Using CC=$CC"
- echo "Using CXX=$CXX"
- test -x "$CC"
- test -x "$CXX"
- make quick-tool
- python3 -m pip install --quiet dataclasses hjson jsonref jsonschema mako termcolor
- make init
- make dram-build
- cd util/auto-benchmark
- chmod +x ./run_ci.sh
- ./run_ci.sh
- make clean generate vsim config=$CI_CONFIG
artifacts:
when: always
expire_in: 1 day
paths:
# QuestaSim compiled work library
- sim/work/
# vsim wrapper scripts (exclude sim/bin/logs/ — not needed by test jobs)
- sim/bin/cachepool_cluster.vsim
# DPI shared library
- sim/work-dpi/
# Software binaries for all kernels
- software/build/CachePoolTests/
# DRAMSys shared libraries and config files (referenced by vsim at runtime)
- hardware/deps/dram_rtl_sim/dramsys_lib/DRAMSys/build/lib/
- hardware/deps/dram_rtl_sim/dramsys_lib/DRAMSys/configs/

# ---------------------------------------------------------------------------
# Test stage: run each kernel in parallel on a separate runner.
# Each job downloads the build artifacts, runs one simulation, and checks
# the output log for failures.
# ---------------------------------------------------------------------------
test:
stage: test
timeout: 1h
needs: [build]
parallel:
matrix:
- KERNEL:
- spin-lock
- load-store_M16
- fdotp-32b_M32768
- gemv_M512_N128_K32
- fmatmul-32b_M32_N32_K32
- fft-32b_M1024_N16
- multi_producer_single_consumer_double_linked_list_M1_N1350_K10
- byte-enable
script:
# The vsim script writes a .rtlbinary marker here; ensure the dir exists.
- mkdir -p sim/bin/logs
- chmod +x sim/bin/cachepool_cluster.vsim
- BIN="${SW_PREFIX}${KERNEL}"
- sim/bin/cachepool_cluster.vsim software/build/CachePoolTests/$BIN 2>&1 | tee test_${KERNEL}.log
- python3 util/auto-benchmark/check-ci.py test_${KERNEL}.log
artifacts:
when: always
expire_in: 1 day
paths:
- util/auto-benchmark/logs
# Full simulation log
- test_*.log
# Performance-monitor trace files written by the simulator
- sim/bin/logs/
2 changes: 1 addition & 1 deletion hardware/bootrom/bootrom.dump
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

/scratch2/diyou/cachepool/ManyRVData/hardware/bootrom/bootrom.elf: file format elf32-littleriscv
/scratch/diyou/cachepool/4t16c/ManyRVData/hardware/bootrom/bootrom.elf: file format elf32-littleriscv


Disassembly of section .text:
Expand Down
Binary file modified hardware/bootrom/bootrom.elf
Binary file not shown.
26 changes: 23 additions & 3 deletions hardware/src/cachepool_cluster.sv
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,9 @@ module cachepool_cluster
axi_narrow_req_t [NumTiles-1:0] axi_core_csr_req, axi_barrier_req;
axi_narrow_resp_t [NumTiles-1:0] axi_core_csr_rsp, axi_barrier_rsp;

// Spill register signals to cut the AXI feedback path
axi_in_req_t axi_in_req_reg;
axi_in_resp_t axi_in_resp_reg;

for (genvar t = 0; t < NumTiles; t++) begin
assign axi_barrier_req[t] = axi_out_req [t][ClusterPeriph];
Expand All @@ -877,6 +880,24 @@ module cachepool_cluster
// TODO: Connect to CSR
assign use_barrier = {NumTiles{1'b1}};

axi_cut #(
.Bypass (0 ),
.aw_chan_t (spatz_axi_in_aw_chan_t ),
.w_chan_t (spatz_axi_in_w_chan_t ),
.b_chan_t (spatz_axi_in_b_chan_t ),
.ar_chan_t (spatz_axi_in_ar_chan_t ),
.r_chan_t (spatz_axi_in_r_chan_t ),
.axi_req_t (spatz_axi_in_req_t ),
.axi_resp_t (spatz_axi_in_resp_t )
) i_cut_ext_narrow_in (
.clk_i (clk_i ),
.rst_ni (rst_ni ),
.slv_req_i (axi_in_req_i ),
.slv_resp_o (axi_in_resp_o ),
.mst_req_o (axi_in_req_reg ),
.mst_resp_i (axi_in_resp_reg )
);

cachepool_cluster_barrier #(
.AddrWidth (AxiAddrWidth ),
.NrPorts (NumTiles ),
Expand All @@ -895,7 +916,6 @@ module cachepool_cluster
.cluster_periph_start_address_i ( tcdm_end_address )
);


axi_mux #(
.SlvAxiIDWidth ( CsrAxiMstIdWidth ),
.slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), // AW Channel Type, slave ports
Expand Down Expand Up @@ -923,8 +943,8 @@ module cachepool_cluster
.clk_i ( clk_i ), // Clock
.rst_ni ( rst_ni ), // Asynchronous reset active low
.test_i ('0 ), // Test Mode enable
.slv_reqs_i ( {axi_in_req_i, axi_core_csr_req} ),
.slv_resps_o ( {axi_in_resp_o, axi_core_csr_rsp} ),
.slv_reqs_i ( {axi_in_req_reg, axi_core_csr_req} ),
.slv_resps_o ( {axi_in_resp_reg, axi_core_csr_rsp} ),
.mst_req_o ( axi_csr_req ),
.mst_resp_i ( axi_csr_rsp )
);
Expand Down
111 changes: 74 additions & 37 deletions hardware/src/cachepool_tile.sv
Original file line number Diff line number Diff line change
Expand Up @@ -545,8 +545,10 @@ module cachepool_tile

// Used to determine the mapping policy between different cache banks.
// Set through CSR
logic [$clog2(TCDMAddrWidth)-1:0] dynamic_offset;
assign dynamic_offset = dynamic_offset_i;
logic [$clog2(TCDMAddrWidth)-1:0] dynamic_offset_d, dynamic_offset_q;
`FF(dynamic_offset_q, dynamic_offset_d, '0)
assign dynamic_offset_d = dynamic_offset_i;

// One entry per flat remote port: flat index = j + r*NrTCDMPortsPerCore
// where j is the xbar index and r is the remote slot within that xbar.
logic [NumRemotePortTile-1:0] remote_out_pready, remote_in_pready;
Expand Down Expand Up @@ -626,19 +628,19 @@ module cachepool_tile
.tcdm_req_chan_t (tcdm_req_chan_t ),
.tcdm_rsp_chan_t (tcdm_rsp_chan_t )
) i_cache_xbar (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.tile_id_i ( tile_id_i ),
.dynamic_offset_i ( dynamic_offset ),
.private_start_addr_i ( private_start_addr_i ),
.num_private_cache_i ( num_private_cache ),
.core_req_i ({xbar_remote_req_gated, cache_req [j]} ),
.core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ),
.core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ),
.tile_sel_o ( xbar_remote_req_dst ),
.mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]} ),
.mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]} ),
.mem_rsp_i ({xbar_remote_rsp_i, cache_xbar_rsp [j]} )
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.tile_id_i ( tile_id_i ),
.dynamic_offset_i ( dynamic_offset_q ),
.private_start_addr_i ( private_start_addr_i ),
.num_private_cache_i ( num_private_cache ),
.core_req_i ({xbar_remote_req_gated, cache_req [j]} ),
.core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ),
.core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ),
.tile_sel_o ( xbar_remote_req_dst ),
.mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]} ),
.mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]} ),
.mem_rsp_i ({xbar_remote_rsp_i, cache_xbar_rsp [j]} )
);
end

Expand Down Expand Up @@ -713,21 +715,56 @@ module cachepool_tile
assign cache_rsp_reg.p.write = cache_rsp_write[cb][j];

end else begin : gen_no_amo
// Bypass AMO and registers
assign cache_req_valid[cb][j] = cache_xbar_req [j][cb].q_valid;
assign cache_rsp_ready[cb][j] = cache_xbar_pready[j][cb];
assign cache_req_addr [cb][j] = cache_xbar_req [j][cb].q.addr;
assign cache_req_meta [cb][j] = cache_xbar_req [j][cb].q.user;
assign cache_req_write[cb][j] = cache_xbar_req [j][cb].q.write;
assign cache_req_data [cb][j] = cache_xbar_req [j][cb].q.data;
assign cache_req_strb [cb][j] = cache_xbar_req [j][cb].q.strb;

assign cache_xbar_rsp[j][cb].p_valid = cache_rsp_valid[cb][j];
assign cache_xbar_rsp[j][cb].q_ready = cache_req_ready[cb][j];
assign cache_xbar_rsp[j][cb].p.data = cache_rsp_data [cb][j];
assign cache_xbar_rsp[j][cb].p.user = cache_rsp_meta [cb][j];

assign cache_xbar_rsp[j][cb].p.write = cache_rsp_write[cb][j];
// Spill register on the request path between xbar side and cache side; the response-path instance below is instantiated with Bypass = 1.
tcdm_req_t cache_req_reg;
tcdm_rsp_t cache_rsp_reg;

// Request-channel ready: driven by the cache side (cache_req_ready[cb][j])
// and fed into the request-side spill register's ready_i input.
logic cache_req_ready_w;

spill_register #(
.T ( tcdm_req_chan_t ),
.Bypass ( 1'b0 )
) i_spill_reg_cache_req (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.valid_i ( cache_xbar_req [j][cb].q_valid ),
.ready_o ( cache_xbar_rsp [j][cb].q_ready ),
.data_i ( cache_xbar_req [j][cb].q ),
.valid_o ( cache_req_reg.q_valid ),
.ready_i ( cache_req_ready_w ),
.data_o ( cache_req_reg.q )
);

spill_register #(
.T ( tcdm_rsp_chan_t ),
.Bypass ( 1'b1 )
) i_spill_reg_cache_rsp (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.valid_i ( cache_rsp_reg.p_valid ),
.ready_o ( cache_rsp_ready [cb][j] ),
.data_i ( cache_rsp_reg.p ),
.valid_o ( cache_xbar_rsp [j][cb].p_valid ),
.ready_i ( cache_xbar_pready[j][cb] ),
.data_o ( cache_xbar_rsp [j][cb].p )
);

assign cache_req_ready_w = cache_req_ready[cb][j];

assign cache_req_valid[cb][j] = cache_req_reg.q_valid;
assign cache_req_addr [cb][j] = cache_req_reg.q.addr;
assign cache_req_meta [cb][j] = cache_req_reg.q.user;
assign cache_req_write[cb][j] = cache_req_reg.q.write;
assign cache_req_data [cb][j] = cache_req_reg.q.data;
assign cache_req_strb [cb][j] = cache_req_reg.q.strb;

assign cache_rsp_reg.p_valid = cache_rsp_valid[cb][j];
assign cache_rsp_reg.p.data = cache_rsp_data [cb][j];
assign cache_rsp_reg.p.user = cache_rsp_meta [cb][j];
assign cache_rsp_reg.p.write = cache_rsp_write[cb][j];

end
end
Expand Down Expand Up @@ -757,12 +794,12 @@ module cachepool_tile
$display(" NumDataBankPerCtrl: %0d", NumDataBankPerCtrl);
$display(" CoalFactor : %0d", L1CoalFactor);
$display(" RefillDataWidth: %0d", RefillDataWidth);
$display(" DynamicOffset : %0d", dynamic_offset);
$display(" DynamicOffset : %0d", dynamic_offset_q);
end

// CL-offset mask: bits below dynamic_offset, verbatim in both directions.
// CL-offset mask: bits below dynamic_offset_q, verbatim in both directions.
logic [SpatzAxiAddrWidth-1:0] bitmask_lo;
assign bitmask_lo = (SpatzAxiAddrWidth'(1) << dynamic_offset) - 1;
assign bitmask_lo = (SpatzAxiAddrWidth'(1) << dynamic_offset_q) - 1;

cache_refill_req_chan_t [NumL1CtrlTile-1 : 0] cache_refill_req;
burst_req_t [NumL1CtrlTile-1 : 0] cache_refill_burst;
Expand Down Expand Up @@ -1031,14 +1068,14 @@ module cachepool_tile

rot_field = addr_rot >> (SpatzAxiAddrWidth - refill_bits_to_rotate);

upper = (addr_rot >> dynamic_offset)
upper = (addr_rot >> dynamic_offset_q)
& ((SpatzAxiAddrWidth'(1) << (SpatzAxiAddrWidth
- dynamic_offset
- dynamic_offset_q
- refill_bits_to_rotate)) - 1);

cache_refill_req_o[cb].q.addr = lower
| (rot_field << dynamic_offset)
| (upper << (dynamic_offset
| (rot_field << dynamic_offset_q)
| (upper << (dynamic_offset_q
+ refill_bits_to_rotate));
end
end
Expand Down
11 changes: 9 additions & 2 deletions sim/sim.mk
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,24 @@ ${WORK_DIR}/compile.vsim.tcl: ${SNLIB_DIR}/rtl_lib.cc ${SNLIB_DIR}/common_lib.cc
echo 'return 0' >> $@

# Wrapper script & GUI script
# The generated scripts derive ROOT_DIR from their own location at runtime so
# that they remain portable across different checkout paths (CI runners, moved
# repos). Every occurrence of $(CACHEPOOL_DIR) baked in by make is rewritten to
# ${ROOT_DIR} by a single sed pass per script; absolute paths that do not
# contain $(CACHEPOOL_DIR) are left unchanged.
#
# The compile step pipes vsim through tee, so the pipeline's exit status is
# tee's and not vsim's; the grep on the "Errors: N," summary line is what fails
# the build. The pattern has to match every non-zero count, including counts
# that contain a zero digit (10, 20, 100, ...).
#
# Steps performed by the canned recipe:
#   1. Compile with QuestaSim and keep the transcript in vsim.log.
#   2. Fail if the transcript reports a non-zero error count.
#   3. Emit the batch wrapper (cachepool_cluster.vsim).
#   4. Emit the GUI wrapper (cachepool_cluster.vsim.gui), which additionally
#      logs all signals and sources ${WAVE_FILE}.
define QUESTASIM
${VSIM} -c -do "source $<; quit" | tee $(dir $<)vsim.log
@! grep -P "Errors: [1-9][0-9]*," $(dir $<)vsim.log
@mkdir -p $(SIMBIN_DIR) $(SIMBIN_DIR)/logs
@echo '#!/bin/bash' > $(SIMBIN_DIR)/cachepool_cluster.vsim
@echo 'ROOT_DIR="$$(cd "$$(dirname "$$(readlink -f "$$0")")/../.." && pwd)"' >> $(SIMBIN_DIR)/cachepool_cluster.vsim
@echo 'echo `realpath $$1` > ${SIMBIN_DIR}/logs/.rtlbinary' >> $(SIMBIN_DIR)/cachepool_cluster.vsim
@echo '${VSIM} +permissive ${VSIM_FLAGS} -do "run -a" -work ${WORK_DIR} -c -ldflags "-Wl,-rpath,${GCC_LIB} -L${FESVR}/lib -lfesvr_vsim -lutil" $1 +permissive-off ++$$1 +PRELOAD=$$1' >> $(SIMBIN_DIR)/cachepool_cluster.vsim
@sed -i 's|$(CACHEPOOL_DIR)|$${ROOT_DIR}|g' $(SIMBIN_DIR)/cachepool_cluster.vsim
@chmod +x $(SIMBIN_DIR)/cachepool_cluster.vsim
@echo '#!/bin/bash' > $(SIMBIN_DIR)/cachepool_cluster.vsim.gui
@echo 'ROOT_DIR="$$(cd "$$(dirname "$$(readlink -f "$$0")")/../.." && pwd)"' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui
@echo 'echo `realpath $$1` > ${SIMBIN_DIR}/logs/.rtlbinary' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui
@echo '${VSIM} +permissive ${VSIM_FLAGS} -do "log -r /*; source ${WAVE_FILE}; run -a" -work ${WORK_DIR} -ldflags "-Wl,-rpath,${GCC_LIB} -L${FESVR}/lib -lfesvr_vsim -lutil" $1 +permissive-off ++$$1 +PRELOAD=$$1' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui
@sed -i 's|$(CACHEPOOL_DIR)|$${ROOT_DIR}|g' $(SIMBIN_DIR)/cachepool_cluster.vsim.gui
@chmod +x $(SIMBIN_DIR)/cachepool_cluster.vsim.gui
endef

Expand Down
3 changes: 2 additions & 1 deletion software/tests/fft-32b/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ int main() {
}
}

printf ("r:%d,i:%d\n", rerror, ierror);
if ((rerror + ierror) > 0)
printf ("Error: r:%d,i:%d\n", rerror, ierror);
}
}

Expand Down
Loading