Multi-Level Cache Performance Analysis

single_level_cache.py

from m5.objects import *
import m5
import sys

args = sys.argv[1:]
if "--" in args:
    args = args[args.index("--") + 1:]
if len(args) == 0:
    print("Usage: build/RISCV/gem5.opt script.py -- <binary> [args...]")
    exit(1)

binary = args[0]
binary_args = args[1:]
print(f"Binary: {binary}")
print(f"Arguments: {binary_args}")

system = System()
system.clk_domain = SrcClockDomain()
system.clk_domain.clock = '2GHz'
system.clk_domain.voltage_domain = VoltageDomain()
system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('16GB')]

system.cpu = TimingSimpleCPU()
system.membus = SystemXBar()

class L1Cache(Cache):
    assoc = 4
    tag_latency = 2
    data_latency = 2
    response_latency = 2
    mshrs = 16
    tgts_per_mshr = 20

system.cpu.icache = L1Cache(size='64kB')
system.cpu.icache.cpu_side = system.cpu.icache_port
system.cpu.icache.mem_side = system.membus.cpu_side_ports

system.cpu.dcache = L1Cache(size='64kB')
system.cpu.dcache.cpu_side = system.cpu.dcache_port
system.cpu.dcache.mem_side = system.membus.cpu_side_ports

system.cpu.createInterruptController()

system.mem_ctrl = MemCtrl()
system.mem_ctrl.dram = DDR4_2400_8x8()
system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.mem_side_ports

process = Process()
process.cmd = [binary] + binary_args
system.workload = SEWorkload.init_compatible(binary)
system.cpu.workload = process
system.cpu.createThreads()

system.system_port = system.membus.cpu_side_ports

root = Root(full_system=False, system=system)
m5.instantiate()

print("Starting simulation...")
exit_event = m5.simulate()
print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")

multi_level_cache.py

from m5.objects import *
import m5
import sys

args = sys.argv[1:]
if "--" in args:
    args = args[args.index("--") + 1:]
if len(args) == 0:
    print("Usage: build/RISCV/gem5.opt multilevel_cache.py -- <binary> [args...]")
    exit(1)

binary = args[0]
binary_args = args[1:]
print(f"Binary: {binary}")
print(f"Arguments: {binary_args}")

system = System()
system.clk_domain = SrcClockDomain()
system.clk_domain.clock = '2GHz'
system.clk_domain.voltage_domain = VoltageDomain()
system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('16GB')]

system.cpu = TimingSimpleCPU()
system.l2bus = L2XBar()
system.l3bus = L2XBar()
system.membus = SystemXBar()

class L1Cache(Cache):
    assoc = 4
    tag_latency = 2
    data_latency = 2
    response_latency = 2
    mshrs = 16
    tgts_per_mshr = 20

class L2Cache(Cache):
    assoc = 8
    tag_latency = 8
    data_latency = 8
    response_latency = 8
    mshrs = 32
    tgts_per_mshr = 20

class L3Cache(Cache):
    assoc = 16
    tag_latency = 20
    data_latency = 20
    response_latency = 20
    mshrs = 64
    tgts_per_mshr = 20

system.cpu.icache = L1Cache(size='64kB')
system.cpu.dcache = L1Cache(size='64kB')
system.cpu.icache.cpu_side = system.cpu.icache_port
system.cpu.dcache.cpu_side = system.cpu.dcache_port
system.cpu.icache.mem_side = system.l2bus.cpu_side_ports
system.cpu.dcache.mem_side = system.l2bus.cpu_side_ports

system.l2cache = L2Cache(size='512kB')
system.l2cache.cpu_side = system.l2bus.mem_side_ports
system.l2cache.mem_side = system.l3bus.cpu_side_ports

system.l3cache = L3Cache(size='2MB')
system.l3cache.cpu_side = system.l3bus.mem_side_ports
system.l3cache.mem_side = system.membus.cpu_side_ports

system.cpu.createInterruptController()

system.mem_ctrl = MemCtrl()
system.mem_ctrl.dram = DDR4_2400_8x8()
system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.mem_side_ports

process = Process()
process.cmd = [binary] + binary_args
system.workload = SEWorkload.init_compatible(binary)
system.cpu.workload = process
system.cpu.createThreads()

system.system_port = system.membus.cpu_side_ports

root = Root(full_system=False, system=system)
m5.instantiate()

print("Starting simulation (Multi-Level Cache)...")
exit_event = m5.simulate()
print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")

benchmark.c

7 access patterns: sequential, hot loop, strided, matrix multiply, memory copy, random access, linked list traversal.

benchmark.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MATRIX_SIZE  128
#define ARRAY_SIZE   65536
#define RANDOM_ITERS 50000
#define STRIDE       16
#define LIST_SIZE    8192

typedef struct Node { int value; struct Node* next; } Node;

static int A[MATRIX_SIZE][MATRIX_SIZE];
static int B[MATRIX_SIZE][MATRIX_SIZE];
static int C[MATRIX_SIZE][MATRIX_SIZE];
static int arr[ARRAY_SIZE];
static int result_arr[ARRAY_SIZE];

static unsigned int seed = 12345;
static unsigned int pseudo_rand() {
    seed ^= seed << 13; seed ^= seed >> 17; seed ^= seed << 5;
    return seed;
}

void test_matrix_multiply() {
    int i, j, k;
    for (i = 0; i < MATRIX_SIZE; i++)
        for (j = 0; j < MATRIX_SIZE; j++) {
            A[i][j] = (i * MATRIX_SIZE + j) % 100;
            B[i][j] = (i + j) % 100; C[i][j] = 0;
        }
    for (i = 0; i < MATRIX_SIZE; i++)
        for (k = 0; k < MATRIX_SIZE; k++)
            for (j = 0; j < MATRIX_SIZE; j++)
                C[i][j] += A[i][k] * B[k][j];
    volatile int sink = C[MATRIX_SIZE/2][MATRIX_SIZE/2]; (void)sink;
    printf("[1] Matrix Multiply done. C[64][64] = %d\n", C[MATRIX_SIZE/2][MATRIX_SIZE/2]);
}

void test_sequential_access() {
    int i; long sum = 0;
    for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i % 1000;
    for (i = 0; i < ARRAY_SIZE; i++) sum += arr[i];
    for (i = 0; i < ARRAY_SIZE; i++) result_arr[i] = arr[i] * 2;
    printf("[2] Sequential Access done. Sum = %ld\n", sum);
}

void test_random_access() {
    int i; long sum = 0; unsigned int idx;
    for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i;
    for (i = 0; i < RANDOM_ITERS; i++) { idx = pseudo_rand() % ARRAY_SIZE; sum += arr[idx]; }
    for (i = 0; i < RANDOM_ITERS; i++) { idx = pseudo_rand() % ARRAY_SIZE; arr[idx] = (int)(sum % 1000); }
    printf("[3] Random Access done. Sum = %ld\n", sum);
}

void test_strided_access() {
    int i; long sum = 0;
    for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i % 500;
    for (i = 0; i < ARRAY_SIZE; i += STRIDE) sum += arr[i];
    for (i = 0; i < ARRAY_SIZE; i += STRIDE) arr[i] = (int)(sum % 1000);
    printf("[4] Strided Access done (stride=%d). Sum = %ld\n", STRIDE, sum);
}

void test_linked_list() {
    int i; long sum = 0;
    Node* nodes = (Node*)malloc(LIST_SIZE * sizeof(Node));
    if (!nodes) { printf("[5] malloc failed\n"); return; }
    for (i = 0; i < LIST_SIZE - 1; i++) { nodes[i].value = i; nodes[i].next = &nodes[i+1]; }
    nodes[LIST_SIZE-1].value = LIST_SIZE-1; nodes[LIST_SIZE-1].next = NULL;
    for (i = LIST_SIZE-1; i > 0; i--) {
        unsigned int j = pseudo_rand() % (i+1);
        Node* tmp = nodes[i].next; nodes[i].next = nodes[j].next; nodes[j].next = tmp;
    }
    Node* cur = &nodes[0];
    while (cur != NULL) { sum += cur->value; cur = cur->next; }
    printf("[5] Linked List Traversal done. Sum = %ld\n", sum);
    free(nodes);
}

void test_hot_loop() {
    int i, j; long sum = 0; int hot[512];
    for (i = 0; i < 512; i++) hot[i] = i;
    for (j = 0; j < 200; j++) for (i = 0; i < 512; i++) sum += hot[i];
    printf("[6] Hot Loop done. Sum = %ld\n", sum);
}

void test_memory_copy() {
    int i;
    for (i = 0; i < ARRAY_SIZE; i++) arr[i] = i % 256;
    for (i = 0; i < ARRAY_SIZE; i++) result_arr[i] = arr[i];
    int ok = 1;
    for (i = 0; i < 100; i++) if (result_arr[i] != arr[i]) { ok = 0; break; }
    printf("[7] Memory Copy done. Verify: %s\n", ok ? "PASS" : "FAIL");
}

int main() {
    printf("==============================================\n");
    printf("  COA Cache Benchmark\n");
    printf("==============================================\n\n");
    printf("--- Starting Benchmark Suite ---\n\n");
    test_sequential_access();
    test_hot_loop();
    test_strided_access();
    test_matrix_multiply();
    test_memory_copy();
    test_random_access();
    test_linked_list();
    printf("\n--- Benchmark Complete ---\n");
    printf("Check m5out/stats.txt for cache metrics.\n");
    return 0;
}

Compilation & Run Commands

Terminal Output — Single Level

Terminal Output — Multi Level

royce@fedora ~/gem5

$ ./build/RISCV/gem5.opt multi_level_cache.py -- ./benchmark
warn: Base 10 memory/cache sizes cast to base 2 (16GiB, 64KiB, 512KiB, 2MiB)
info: RVV enabled, VLEN = 256 bits, ELEN = 64 bits
Starting simulation (Multi-Level Cache)...
[2] Sequential Access done. Sum = 32610880
[6] Hot Loop done. Sum = 26163200
[4] Strided Access done (stride=16). Sum = 1015460
[1] Matrix Multiply done. C[64][64] = 295312
[7] Memory Copy done. Verify: PASS
[3] Random Access done. Sum = 1638151029
[5] Linked List Traversal done. Sum = 9560438
--- Benchmark Complete ---
Exiting @ tick 172574311500 because exiting with last active thread context

Configuration	Cycles	Improvement
Single Level (L1 only)	359,111,398	baseline
Multi Level (L1+L2+L3)	345,148,623	▼ 3.89% faster

Cache Level	Metric	Single Level	Multi Level
L1 I-Cache	Overall Miss Rate	0.0001%	0.0001%
L1 D-Cache	Overall Miss Rate	0.41%	0.41%
L1 D-Cache	Read Miss Rate	0.22%	0.22%
L1 D-Cache	Write Miss Rate	1.31%	1.31%
L2 Cache	Overall Miss Rate	N/A	14.26%
L2 Cache	Hit Rate (caught L1 misses)	N/A	85.74%
L3 Cache	Overall Miss Rate	N/A	61.87%
L3 Cache	Hit Rate (caught L2 misses)	N/A	38.13%

Cache	Metric	Single Level (ticks)	Multi Level (ticks)	Reduction
L1 D-Cache	Avg Miss Latency	59,598	14,144	▼ 76.27%
L1 D-Cache	Read Miss Latency	59,704	7,035	▼ 88.22%
L1 D-Cache	Write Miss Latency	59,508	20,237	▼ 66.00%
L2 Cache	Avg Miss Latency	N/A	55,139	—
L3 Cache	Avg Miss Latency	N/A	68,105	—

Configuration	Calculation	AMAT
Single Level	2 + (0.004051 × 59,598)	~243.5 cycles
Multi Level	2 + (0.004051 × 14,144)	~59.3 cycles

Multi-Level Cache Performance Analysis

single_level_cache.py

multi_level_cache.py

benchmark.c

Compilation & Run Commands

Terminal Output — Single Level

Terminal Output — Multi Level

Results — Execution Cycles

Results — Cache Miss Rates

Results — Average Miss Latency

AMAT

Key Findings