Multi-Level Cache Performance Analysis

gem5 v25.1.0.0 • RISC-V SE Mode • Fedora 43 • TimingSimpleCPU • DDR4-2400

single_level_cache.py

single_level_cache.py
from m5.objects import *
import m5
import sys

args = sys.argv[1:]
if "--" in args:
    args = args[args.index("--") + 1:]
if len(args) == 0:
    print("Usage: build/RISCV/gem5.opt script.py -- <binary> [args...]")
    exit(1)

binary = args[0]
binary_args = args[1:]
print(f"Binary: {binary}")
print(f"Arguments: {binary_args}")

system = System()
system.clk_domain = SrcClockDomain()
system.clk_domain.clock = '2GHz'
system.clk_domain.voltage_domain = VoltageDomain()
system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('16GB')]

system.cpu = TimingSimpleCPU()
system.membus = SystemXBar()

class L1Cache(Cache):
    assoc = 4
    tag_latency = 2
    data_latency = 2
    response_latency = 2
    mshrs = 16
    tgts_per_mshr = 20

system.cpu.icache = L1Cache(size='64kB')
system.cpu.icache.cpu_side = system.cpu.icache_port
system.cpu.icache.mem_side = system.membus.cpu_side_ports

system.cpu.dcache = L1Cache(size='64kB')
system.cpu.dcache.cpu_side = system.cpu.dcache_port
system.cpu.dcache.mem_side = system.membus.cpu_side_ports

system.cpu.createInterruptController()

system.mem_ctrl = MemCtrl()
system.mem_ctrl.dram = DDR4_2400_8x8()
system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.mem_side_ports

process = Process()
process.cmd = [binary] + binary_args
system.workload = SEWorkload.init_compatible(binary)
system.cpu.workload = process
system.cpu.createThreads()

system.system_port = system.membus.cpu_side_ports

root = Root(full_system=False, system=system)
m5.instantiate()

print("Starting simulation...")
exit_event = m5.simulate()
print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")

multi_level_cache.py

multi_level_cache.py
from m5.objects import *
import m5
import sys

args = sys.argv[1:]
if "--" in args:
    args = args[args.index("--") + 1:]
if len(args) == 0:
    print("Usage: build/RISCV/gem5.opt multilevel_cache.py -- <binary> [args...]")
    exit(1)

binary = args[0]
binary_args = args[1:]
print(f"Binary: {binary}")
print(f"Arguments: {binary_args}")

system = System()
system.clk_domain = SrcClockDomain()
system.clk_domain.clock = '2GHz'
system.clk_domain.voltage_domain = VoltageDomain()
system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('16GB')]

system.cpu = TimingSimpleCPU()
system.l2bus = L2XBar()
system.l3bus = L2XBar()
system.membus = SystemXBar()

class L1Cache(Cache):
    assoc = 4
    tag_latency = 2
    data_latency = 2
    response_latency = 2
    mshrs = 16
    tgts_per_mshr = 20

class L2Cache(Cache):
    assoc = 8
    tag_latency = 8
    data_latency = 8
    response_latency = 8
    mshrs = 32
    tgts_per_mshr = 20

class L3Cache(Cache):
    assoc = 16
    tag_latency = 20
    data_latency = 20
    response_latency = 20
    mshrs = 64
    tgts_per_mshr = 20

system.cpu.icache = L1Cache(size='64kB')
system.cpu.dcache = L1Cache(size='64kB')
system.cpu.icache.cpu_side = system.cpu.icache_port
system.cpu.dcache.cpu_side = system.cpu.dcache_port
system.cpu.icache.mem_side = system.l2bus.cpu_side_ports
system.cpu.dcache.mem_side = system.l2bus.cpu_side_ports

system.l2cache = L2Cache(size='512kB')
system.l2cache.cpu_side = system.l2bus.mem_side_ports
system.l2cache.mem_side = system.l3bus.cpu_side_ports

system.l3cache = L3Cache(size='2MB')
system.l3cache.cpu_side = system.l3bus.mem_side_ports
system.l3cache.mem_side = system.membus.cpu_side_ports

system.cpu.createInterruptController()

system.mem_ctrl = MemCtrl()
system.mem_ctrl.dram = DDR4_2400_8x8()
system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.mem_side_ports

process = Process()
process.cmd = [binary] + binary_args
system.workload = SEWorkload.init_compatible(binary)
system.cpu.workload = process
system.cpu.createThreads()

system.system_port = system.membus.cpu_side_ports

root = Root(full_system=False, system=system)
m5.instantiate()

print("Starting simulation (Multi-Level Cache)...")
exit_event = m5.simulate()
print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")

benchmark.c

7 access patterns: sequential, hot loop, strided, matrix multiply, memory copy, random access, linked list traversal.

benchmark.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MATRIX_SIZE  128
#define ARRAY_SIZE   65536
#define RANDOM_ITERS 50000
#define STRIDE       16
#define LIST_SIZE    8192

typedef struct Node { int value; struct Node* next; } Node;

static int A[MATRIX_SIZE][MATRIX_SIZE];
static int B[MATRIX_SIZE][MATRIX_SIZE];
static int C[MATRIX_SIZE][MATRIX_SIZE];
static int arr[ARRAY_SIZE];
static int result_arr[ARRAY_SIZE];

static unsigned int seed = 12345;
static unsigned int pseudo_rand() {
    seed ^= seed << 13; seed ^= seed >> 17; seed ^= seed << 5;
    return seed;
}

void test_matrix_multiply() {
    int i, j, k;
    for (i = 0; i < MATRIX_SIZE; i++)
        for (j = 0; j < MATRIX_SIZE; j++) {
            A[i][j] = (i * MATRIX_SIZE + j) % 100;
            B[i][j] = (i + j) % 100; C[i][j] = 0;
        }
    for (i = 0; i < MATRIX_SIZE; i++)
        for (k = 0; k < MATRIX_SIZE; k++)
            for (j = 0; j < MATRIX_SIZE; j++)
                C[i][j] += A[i][k] * B[k][j];
    volatile int sink = C[MATRIX_SIZE/2][MATRIX_SIZE/2]; (void)sink;
    printf("[1] Matrix Multiply done. C[64][64] = %d\n", C[MATRIX_SIZE/2][MATRIX_SIZE/2]);
}

void test_sequential_access() {
    int i; long sum = 0;
    for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i % 1000;
    for (i = 0; i < ARRAY_SIZE; i++) sum += arr[i];
    for (i = 0; i < ARRAY_SIZE; i++) result_arr[i] = arr[i] * 2;
    printf("[2] Sequential Access done. Sum = %ld\n", sum);
}

void test_random_access() {
    int i; long sum = 0; unsigned int idx;
    for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i;
    for (i = 0; i < RANDOM_ITERS; i++) { idx = pseudo_rand() % ARRAY_SIZE; sum += arr[idx]; }
    for (i = 0; i < RANDOM_ITERS; i++) { idx = pseudo_rand() % ARRAY_SIZE; arr[idx] = (int)(sum % 1000); }
    printf("[3] Random Access done. Sum = %ld\n", sum);
}

void test_strided_access() {
    int i; long sum = 0;
    for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i % 500;
    for (i = 0; i < ARRAY_SIZE; i += STRIDE) sum += arr[i];
    for (i = 0; i < ARRAY_SIZE; i += STRIDE) arr[i] = (int)(sum % 1000);
    printf("[4] Strided Access done (stride=%d). Sum = %ld\n", STRIDE, sum);
}

void test_linked_list() {
    int i; long sum = 0;
    Node* nodes = (Node*)malloc(LIST_SIZE * sizeof(Node));
    if (!nodes) { printf("[5] malloc failed\n"); return; }
    for (i = 0; i < LIST_SIZE - 1; i++) { nodes[i].value = i; nodes[i].next = &nodes[i+1]; }
    nodes[LIST_SIZE-1].value = LIST_SIZE-1; nodes[LIST_SIZE-1].next = NULL;
    for (i = LIST_SIZE-1; i > 0; i--) {
        unsigned int j = pseudo_rand() % (i+1);
        Node* tmp = nodes[i].next; nodes[i].next = nodes[j].next; nodes[j].next = tmp;
    }
    Node* cur = &nodes[0];
    while (cur != NULL) { sum += cur->value; cur = cur->next; }
    printf("[5] Linked List Traversal done. Sum = %ld\n", sum);
    free(nodes);
}

void test_hot_loop() {
    int i, j; long sum = 0; int hot[512];
    for (i = 0; i < 512; i++) hot[i] = i;
    for (j = 0; j < 200; j++) for (i = 0; i < 512; i++) sum += hot[i];
    printf("[6] Hot Loop done. Sum = %ld\n", sum);
}

void test_memory_copy() {
    int i;
    for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i % 256;
    for (i = 0; i < ARRAY_SIZE; i++) result_arr[i] = arr[i];
    int ok = 1;
    for (i = 0; i < 100; i++) if (result_arr[i] != arr[i]) { ok = 0; break; }
    printf("[7] Memory Copy done. Verify: %s\n", ok ? "PASS" : "FAIL");
}

int main() {
    printf("==============================================\n");
    printf("  COA Cache Benchmark\n");
    printf("==============================================\n\n");
    printf("--- Starting Benchmark Suite ---\n\n");
    test_sequential_access();
    test_hot_loop();
    test_strided_access();
    test_matrix_multiply();
    test_memory_copy();
    test_random_access();
    test_linked_list();
    printf("\n--- Benchmark Complete ---\n");
    printf("Check m5out/stats.txt for cache metrics.\n");
    return 0;
}

Compilation & Run Commands

Compile benchmark for RISC-V:

bash
~/riscv/bin/riscv64-unknown-elf-gcc -static benchmark.c -o benchmark

Run single-level cache simulation:

bash
./build/RISCV/gem5.opt single_level_cache.py -- ./benchmark
mv m5out/stats.txt m5out/stats_single.txt

Run multi-level cache simulation:

bash
./build/RISCV/gem5.opt multi_level_cache.py -- ./benchmark
mv m5out/stats.txt m5out/stats_multi.txt

Extract metrics:

bash
cat m5out/stats_single.txt | grep -E "missRate|numCycles|overallAvgMissLatency"
cat m5out/stats_multi.txt  | grep -E "missRate|numCycles|overallAvgMissLatency"

Terminal Output — Single Level

royce@fedora ~/gem5
$ ./build/RISCV/gem5.opt single_level_cache.py -- ./benchmark
warn: Base 10 memory/cache sizes cast to base 2 (16GiB, 64KiB x2)
info: RVV enabled, VLEN = 256 bits, ELEN = 64 bits
Starting simulation...
[2] Sequential Access done. Sum = 32610880
[6] Hot Loop done. Sum = 26163200
[4] Strided Access done (stride=16). Sum = 1015460
[1] Matrix Multiply done. C[64][64] = 295312
[7] Memory Copy done. Verify: PASS
[3] Random Access done. Sum = 1638151029
[5] Linked List Traversal done. Sum = 9560438
--- Benchmark Complete ---
Exiting @ tick 179555699000 because exiting with last active thread context

Terminal Output — Multi Level

royce@fedora ~/gem5
$ ./build/RISCV/gem5.opt multi_level_cache.py -- ./benchmark
warn: Base 10 memory/cache sizes cast to base 2 (16GiB, 64KiB, 512KiB, 2MiB)
info: RVV enabled, VLEN = 256 bits, ELEN = 64 bits
Starting simulation (Multi-Level Cache)...
[2] Sequential Access done. Sum = 32610880
[6] Hot Loop done. Sum = 26163200
[4] Strided Access done (stride=16). Sum = 1015460
[1] Matrix Multiply done. C[64][64] = 295312
[7] Memory Copy done. Verify: PASS
[3] Random Access done. Sum = 1638151029
[5] Linked List Traversal done. Sum = 9560438
--- Benchmark Complete ---
Exiting @ tick 172574311500 because exiting with last active thread context

Results — Execution Cycles

ConfigurationCyclesImprovement
Single Level (L1 only)359,111,398baseline
Multi Level (L1+L2+L3)345,148,623▼ 3.89% faster

Results — Cache Miss Rates

Cache LevelMetricSingle LevelMulti Level
L1 I-CacheOverall Miss Rate0.0001%0.0001%
L1 D-CacheOverall Miss Rate0.41%0.41%
L1 D-CacheRead Miss Rate0.22%0.22%
L1 D-CacheWrite Miss Rate1.31%1.31%
L2 CacheOverall Miss RateN/A14.26%
L2 CacheHit Rate (caught L1 misses)N/A85.74%
L3 CacheOverall Miss RateN/A61.87%
L3 CacheHit Rate (caught L2 misses)N/A38.13%

Results — Average Miss Latency

CacheMetricSingle Level (ticks)Multi Level (ticks)Reduction
L1 D-CacheAvg Miss Latency59,59814,144▼ 76.27%
L1 D-CacheRead Miss Latency59,7047,035▼ 88.22%
L1 D-CacheWrite Miss Latency59,50820,237▼ 66.00%
L2 CacheAvg Miss LatencyN/A55,139
L3 CacheAvg Miss LatencyN/A68,105

AMAT

Formula: AMAT = Hit Time + (Miss Rate × Miss Penalty)  |  L1 Hit Time = 2 cycles  |  Clock = 2GHz
ConfigurationCalculationAMAT
Single Level2 + (0.004051 × 59,598)~243.5 cycles
Multi Level2 + (0.004051 × 14,144)~59.3 cycles
AMAT improved 75.6% — from ~243.5 cycles to ~59.3 cycles.

Key Findings

1. Multi-level cache reduced execution cycles by 3.89% (359M → 345M).
2. D-Cache miss latency dropped 76.27% — 59,598 → 14,144 ticks per miss.
3. Read miss latency improved 88.22% — 59,704 → 7,035 ticks.
4. AMAT improved 75.6% — ~243.5 → ~59.3 cycles.
5. L2 caught 85.74% of all L1 misses before reaching DRAM.
6. L3 caught an additional 38.13% of L2 misses.
Note: L1 miss rates are identical in both configs (0.41%) — expected. The difference is what happens after a miss: single-level goes straight to DRAM; multi-level traverses L2 → L3 → DRAM, dramatically reducing penalty.