From 2af9dedfa34c51c00f5e172fb1b5d7df684954b4 Mon Sep 17 00:00:00 2001 From: Bruno Levy Date: Sat, 7 Oct 2023 11:53:15 +0200 Subject: [PATCH 01/23] Added links to libraries to be ported --- LiteX/software/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/LiteX/software/README.md b/LiteX/software/README.md index bd6cefe6..d222c7d0 100644 --- a/LiteX/software/README.md +++ b/LiteX/software/README.md @@ -12,3 +12,12 @@ This directory contains libraries and software packages for LiteX SoCs: - [Doom](Doom/): a port of Doom for LiteOS - [Tagl](Tagl/): a 3D software renderer (that I wrote in the 90s) ported to LiteOS - [Libs](Libs/): common libraries (ELF support, OLED screen, framebuffer, Dear ImGui port, stdio adapter) + + +Links, stuff to port +-------------------- + +- [dos-like](https://github.com/mattiasgustavsson/dos-like) +- [tiny-gl](https://github.com/C-Chads/tinygl) +- [tcc-riscv](https://github.com/sellicott/tcc-riscv32) +- [Bubble Universe](https://stardot.org.uk/forums/viewtopic.php?t=25833&sid=33182a6ffa6f84b08bb6f52cae2ad35d) \ No newline at end of file From 9b663f77cb69ef70370d126ddd2858f992cbd903 Mon Sep 17 00:00:00 2001 From: Bruno Levy Date: Mon, 23 Oct 2023 20:51:50 +0200 Subject: [PATCH 02/23] Added Andy Sloane's CORDIC-based donut demo --- FemtoRV/FIRMWARE/EXAMPLES/donut.c | 173 ++++++++++++++++++ .../FROM_BLINKER_TO_RISCV/FIRMWARE/donut.c | 173 ++++++++++++++++++ 2 files changed, 346 insertions(+) create mode 100644 FemtoRV/FIRMWARE/EXAMPLES/donut.c create mode 100644 FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut.c diff --git a/FemtoRV/FIRMWARE/EXAMPLES/donut.c b/FemtoRV/FIRMWARE/EXAMPLES/donut.c new file mode 100644 index 00000000..6af0cf4e --- /dev/null +++ b/FemtoRV/FIRMWARE/EXAMPLES/donut.c @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include + +#define debug(...) 
+//#define debug printf + +// torus radii and distance from camera +// these are pretty baked-in to other constants now, so it probably won't work +// if you change them too much. +const int dz = 5, r1 = 1, r2 = 2; + +// "Magic circle algorithm"? DDA? I've seen this formulation in a few places; +// first in Hal Chamberlain's Musical Applications of Microprocessors, but not +// sure what to call it, or how to justify it theoretically. It seems to +// correctly rotate around a point "near" the origin, without losing magnitude +// over long periods of time, as long as there are enough bits of precision in x +// and y. I use 14 bits here. +#define R(s,x,y) x-=(y>>s); y+=(x>>s) + +// CORDIC algorithm to find magnitude of |x,y| by rotating the x,y vector onto +// the x axis. This also brings vector (x2,y2) along for the ride, and writes +// back to x2 -- this is used to rotate the lighting vector from the normal of +// the torus surface towards the camera, and thus determine the lighting amount. +// We only need to keep one of the two lighting normal coordinates. 
+int length_cordic(int16_t x, int16_t y, int16_t *x2_, int16_t y2) { + int x2 = *x2_; + if (x < 0) { // start in right half-plane + x = -x; + x2 = -x2; + } + for (int i = 0; i < 8; i++) { + int t = x; + int t2 = x2; + if (y < 0) { + x -= y >> i; + y += t >> i; + x2 -= y2 >> i; + y2 += t2 >> i; + } else { + x += y >> i; + y -= t >> i; + x2 += y2 >> i; + y2 -= t2 >> i; + } + } + // divide by 0.625 as a cheap approximation to the 0.607 scaling factor factor + // introduced by this algorithm (see https://en.wikipedia.org/wiki/CORDIC) + *x2_ = (x2 >> 1) + (x2 >> 3); + return (x >> 1) + (x >> 3); +} + +void main() { + // high-precision rotation directions, sines and cosines and their products + int16_t sB = 0, cB = 16384; + int16_t sA = 11583, cA = 11583; + int16_t sAsB = 0, cAsB = 0; + int16_t sAcB = 11583, cAcB = 11583; + + for (;;) { + int x1_16 = cAcB << 2; + + // yes this is a multiply but dz is 5 so it's (sb + (sb<<2)) >> 6 effectively + int p0x = dz * sB >> 6; + int p0y = dz * sAcB >> 6; + int p0z = -dz * cAcB >> 6; + + const int r1i = r1*256; + const int r2i = r2*256; + + int niters = 0; + int nnormals = 0; + int16_t yincC = (cA >> 6) + (cA >> 5); // 12*cA >> 8; + int16_t yincS = (sA >> 6) + (sA >> 5); // 12*sA >> 8; + int16_t xincX = (cB >> 7) + (cB >> 6); // 6*cB >> 8; + int16_t xincY = (sAsB >> 7) + (sAsB >> 6); // 6*sAsB >> 8; + int16_t xincZ = (cAsB >> 7) + (cAsB >> 6); // 6*cAsB >> 8; + int16_t ycA = -((cA >> 1) + (cA >> 4)); // -12 * yinc1 = -9*cA >> 4; + int16_t ysA = -((sA >> 1) + (sA >> 4)); // -12 * yinc2 = -9*sA >> 4; + //int dmin = INT_MAX, dmax = -INT_MAX; + for (int j = 0; j < 23; j++, ycA += yincC, ysA += yincS) { + int xsAsB = (sAsB >> 4) - sAsB; // -40*xincY + int xcAsB = (cAsB >> 4) - cAsB; // -40*xincZ; + + int16_t vxi14 = (cB >> 4) - cB - sB; // -40*xincX - sB; + int16_t vyi14 = ycA - xsAsB - sAcB; + int16_t vzi14 = ysA + xcAsB + cAcB; + + for (int i = 0; i < 79; i++, vxi14 += xincX, vyi14 -= xincY, vzi14 += xincZ) { + int t = 512; // (256 * 
dz) - r2i - r1i; + + int16_t px = p0x + (vxi14 >> 5); // assuming t = 512, t*vxi>>8 == vxi<<1 + int16_t py = p0y + (vyi14 >> 5); + int16_t pz = p0z + (vzi14 >> 5); + debug("pxyz (%+4d,%+4d,%+4d)\n", px, py, pz); + int16_t lx0 = sB >> 2; + int16_t ly0 = sAcB - cA >> 2; + int16_t lz0 = -cAcB - sA >> 2; + for (;;) { + int t0, t1, t2, d; + int16_t lx = lx0, ly = ly0, lz = lz0; + debug("[%2d,%2d] (px, py) = (%d, %d), (lx, ly) = (%d, %d) -> ", j, i, px, py, lx, ly); + t0 = length_cordic(px, py, &lx, ly); + debug("t0=%d (lx', ly') = (%d, %d)\n", t0, lx, ly); + t1 = t0 - r2i; + t2 = length_cordic(pz, t1, &lz, lx); + d = t2 - r1i; + t += d; + + if (t > 8*256) { + putchar(' '); + break; + } else if (d < 2) { + int N = lz >> 9; + putchar(".,-~:;!*=#$@"[N > 0 ? N < 12 ? N : 11 : 0]); + nnormals++; + break; + } + // todo: shift and add version of this + + /* + if (d < dmin) dmin = d; + if (d > dmax) dmax = d; + px += d*vxi14 >> 14; + py += d*vyi14 >> 14; + pz += d*vzi14 >> 14; + */ + { + // 11x1.14 fixed point 3x parallel multiply + // only 16 bit registers needed; starts from highest bit to lowest + // d is about 2..1100, so 11 bits are sufficient + int16_t dx = 0, dy = 0, dz = 0; + int16_t a = vxi14, b = vyi14, c = vzi14; + while (d) { + if (d&1024) { + dx += a; + dy += b; + dz += c; + } + d = (d&1023) << 1; + a >>= 1; + b >>= 1; + c >>= 1; + } + // we already shifted down 10 bits, so get the last four + px += dx >> 4; + py += dy >> 4; + pz += dz >> 4; + } + + niters++; + } + } + puts(""); + } + printf("%d iterations %d lit pixels\x1b[K", niters, nnormals); +// fflush(stdout); + + // rotate sines, cosines, and products thereof + // this animates the torus rotation about two axes + R(5, cA, sA); + R(5, cAsB, sAsB); + R(5, cAcB, sAcB); + R(6, cB, sB); + R(6, cAcB, cAsB); + R(6, sAcB, sAsB); + +// usleep(15000); + printf("\r\x1b[23A"); + } +} diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut.c b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut.c new file 
mode 100644 index 00000000..6af0cf4e --- /dev/null +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut.c @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include + +#define debug(...) +//#define debug printf + +// torus radii and distance from camera +// these are pretty baked-in to other constants now, so it probably won't work +// if you change them too much. +const int dz = 5, r1 = 1, r2 = 2; + +// "Magic circle algorithm"? DDA? I've seen this formulation in a few places; +// first in Hal Chamberlain's Musical Applications of Microprocessors, but not +// sure what to call it, or how to justify it theoretically. It seems to +// correctly rotate around a point "near" the origin, without losing magnitude +// over long periods of time, as long as there are enough bits of precision in x +// and y. I use 14 bits here. +#define R(s,x,y) x-=(y>>s); y+=(x>>s) + +// CORDIC algorithm to find magnitude of |x,y| by rotating the x,y vector onto +// the x axis. This also brings vector (x2,y2) along for the ride, and writes +// back to x2 -- this is used to rotate the lighting vector from the normal of +// the torus surface towards the camera, and thus determine the lighting amount. +// We only need to keep one of the two lighting normal coordinates. 
+int length_cordic(int16_t x, int16_t y, int16_t *x2_, int16_t y2) { + int x2 = *x2_; + if (x < 0) { // start in right half-plane + x = -x; + x2 = -x2; + } + for (int i = 0; i < 8; i++) { + int t = x; + int t2 = x2; + if (y < 0) { + x -= y >> i; + y += t >> i; + x2 -= y2 >> i; + y2 += t2 >> i; + } else { + x += y >> i; + y -= t >> i; + x2 += y2 >> i; + y2 -= t2 >> i; + } + } + // divide by 0.625 as a cheap approximation to the 0.607 scaling factor factor + // introduced by this algorithm (see https://en.wikipedia.org/wiki/CORDIC) + *x2_ = (x2 >> 1) + (x2 >> 3); + return (x >> 1) + (x >> 3); +} + +void main() { + // high-precision rotation directions, sines and cosines and their products + int16_t sB = 0, cB = 16384; + int16_t sA = 11583, cA = 11583; + int16_t sAsB = 0, cAsB = 0; + int16_t sAcB = 11583, cAcB = 11583; + + for (;;) { + int x1_16 = cAcB << 2; + + // yes this is a multiply but dz is 5 so it's (sb + (sb<<2)) >> 6 effectively + int p0x = dz * sB >> 6; + int p0y = dz * sAcB >> 6; + int p0z = -dz * cAcB >> 6; + + const int r1i = r1*256; + const int r2i = r2*256; + + int niters = 0; + int nnormals = 0; + int16_t yincC = (cA >> 6) + (cA >> 5); // 12*cA >> 8; + int16_t yincS = (sA >> 6) + (sA >> 5); // 12*sA >> 8; + int16_t xincX = (cB >> 7) + (cB >> 6); // 6*cB >> 8; + int16_t xincY = (sAsB >> 7) + (sAsB >> 6); // 6*sAsB >> 8; + int16_t xincZ = (cAsB >> 7) + (cAsB >> 6); // 6*cAsB >> 8; + int16_t ycA = -((cA >> 1) + (cA >> 4)); // -12 * yinc1 = -9*cA >> 4; + int16_t ysA = -((sA >> 1) + (sA >> 4)); // -12 * yinc2 = -9*sA >> 4; + //int dmin = INT_MAX, dmax = -INT_MAX; + for (int j = 0; j < 23; j++, ycA += yincC, ysA += yincS) { + int xsAsB = (sAsB >> 4) - sAsB; // -40*xincY + int xcAsB = (cAsB >> 4) - cAsB; // -40*xincZ; + + int16_t vxi14 = (cB >> 4) - cB - sB; // -40*xincX - sB; + int16_t vyi14 = ycA - xsAsB - sAcB; + int16_t vzi14 = ysA + xcAsB + cAcB; + + for (int i = 0; i < 79; i++, vxi14 += xincX, vyi14 -= xincY, vzi14 += xincZ) { + int t = 512; // (256 * 
dz) - r2i - r1i; + + int16_t px = p0x + (vxi14 >> 5); // assuming t = 512, t*vxi>>8 == vxi<<1 + int16_t py = p0y + (vyi14 >> 5); + int16_t pz = p0z + (vzi14 >> 5); + debug("pxyz (%+4d,%+4d,%+4d)\n", px, py, pz); + int16_t lx0 = sB >> 2; + int16_t ly0 = sAcB - cA >> 2; + int16_t lz0 = -cAcB - sA >> 2; + for (;;) { + int t0, t1, t2, d; + int16_t lx = lx0, ly = ly0, lz = lz0; + debug("[%2d,%2d] (px, py) = (%d, %d), (lx, ly) = (%d, %d) -> ", j, i, px, py, lx, ly); + t0 = length_cordic(px, py, &lx, ly); + debug("t0=%d (lx', ly') = (%d, %d)\n", t0, lx, ly); + t1 = t0 - r2i; + t2 = length_cordic(pz, t1, &lz, lx); + d = t2 - r1i; + t += d; + + if (t > 8*256) { + putchar(' '); + break; + } else if (d < 2) { + int N = lz >> 9; + putchar(".,-~:;!*=#$@"[N > 0 ? N < 12 ? N : 11 : 0]); + nnormals++; + break; + } + // todo: shift and add version of this + + /* + if (d < dmin) dmin = d; + if (d > dmax) dmax = d; + px += d*vxi14 >> 14; + py += d*vyi14 >> 14; + pz += d*vzi14 >> 14; + */ + { + // 11x1.14 fixed point 3x parallel multiply + // only 16 bit registers needed; starts from highest bit to lowest + // d is about 2..1100, so 11 bits are sufficient + int16_t dx = 0, dy = 0, dz = 0; + int16_t a = vxi14, b = vyi14, c = vzi14; + while (d) { + if (d&1024) { + dx += a; + dy += b; + dz += c; + } + d = (d&1023) << 1; + a >>= 1; + b >>= 1; + c >>= 1; + } + // we already shifted down 10 bits, so get the last four + px += dx >> 4; + py += dy >> 4; + pz += dz >> 4; + } + + niters++; + } + } + puts(""); + } + printf("%d iterations %d lit pixels\x1b[K", niters, nnormals); +// fflush(stdout); + + // rotate sines, cosines, and products thereof + // this animates the torus rotation about two axes + R(5, cA, sA); + R(5, cAsB, sAsB); + R(5, cAcB, sAcB); + R(6, cB, sB); + R(6, cAcB, cAsB); + R(6, sAcB, sAsB); + +// usleep(15000); + printf("\r\x1b[23A"); + } +} From 0bac91352c80ffc16a8696a132f3325086b38729 Mon Sep 17 00:00:00 2001 From: Bruno Levy Date: Mon, 30 Oct 2023 08:16:48 +0100 Subject: 
[PATCH 03/23] Added provenance info for donut.c New version of donut with nice "rendering" and stats --- FemtoRV/FIRMWARE/EXAMPLES/donut.c | 4 + .../FROM_BLINKER_TO_RISCV/FIRMWARE/donut.c | 13 +- .../FROM_BLINKER_TO_RISCV/FIRMWARE/donut2.c | 427 ++++++++++++++++++ .../FROM_BLINKER_TO_RISCV/pipeline9.v | 2 +- .../FROM_BLINKER_TO_RISCV/run_verilator.sh | 2 +- 5 files changed, 444 insertions(+), 4 deletions(-) create mode 100644 FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut2.c diff --git a/FemtoRV/FIRMWARE/EXAMPLES/donut.c b/FemtoRV/FIRMWARE/EXAMPLES/donut.c index 6af0cf4e..3958b99e 100644 --- a/FemtoRV/FIRMWARE/EXAMPLES/donut.c +++ b/FemtoRV/FIRMWARE/EXAMPLES/donut.c @@ -1,3 +1,7 @@ + +// donut.c by Andy Sloane (@a1k0n) +// https://gist.github.com/a1k0n/8ea6516b4946ab36348fb61703dc3194 + #include #include #include diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut.c b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut.c index 6af0cf4e..ef18eebb 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut.c +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut.c @@ -1,9 +1,14 @@ +// donut.c by Andy Sloane (@a1k0n) +// https://gist.github.com/a1k0n/8ea6516b4946ab36348fb61703dc3194 + #include #include #include #include #include +#define WITH_RV32M + #define debug(...) 
//#define debug printf @@ -120,13 +125,17 @@ void main() { } // todo: shift and add version of this + /* if (d < dmin) dmin = d; if (d > dmax) dmax = d; + */ + +#ifdef WITH_RV32M px += d*vxi14 >> 14; py += d*vyi14 >> 14; pz += d*vzi14 >> 14; - */ +#else { // 11x1.14 fixed point 3x parallel multiply // only 16 bit registers needed; starts from highest bit to lowest @@ -149,7 +158,7 @@ void main() { py += dy >> 4; pz += dz >> 4; } - +#endif niters++; } } diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut2.c b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut2.c new file mode 100644 index 00000000..b7ec422e --- /dev/null +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/donut2.c @@ -0,0 +1,427 @@ +// donut.c by Andy Sloane (@a1k0n) +// https://gist.github.com/a1k0n/8ea6516b4946ab36348fb61703dc3194 +// Bruno Levy: added ANSI "pseudo-graphics", and RISC-V statistics + +#define CPU_NAME "TordBoyau ULX3S" // Name of your CPU and FPGA board +#define MHZ 95 // Frequency (without a timer we cannot guess) +#define USE_MUL // Define if you support RV32M + +// #define PRECISE // Define for a more accurate result (but it costs a bit) +#define START_FRAMES 20 // Number of frames without display + // (for accurate CPI/MIPS measurements) + +#include +#include +#include +#include +#include + +// 0 15 31 47 63 79 96 112 127 143 159 175 191 207 223 240 255 + +const char* colormap[34] = { + "0", + "8;5;232", + "8;5;233", + "8;5;234", + "8;5;235", + "8;5;236", + "8;5;237", + "8;5;238", + "8;5;239", + "8;5;240", + "8;5;241", + "8;5;242", + "8;5;243", + "8;5;244", + "8;5;245", + "8;5;246", + "8;5;247", + "8;5;248", + "8;5;249", + "8;5;250", + "8;5;251", + "8;5;252", + "8;5;253", + "8;5;254", + "8;5;255", + "7", + "8;5;16", + "8;5;17", + "8;5;18", + "8;5;19", + "8;5;20", + "8;5;21", + "8;5;22", + "8;5;23", +}; + +int prev_color1=0; +int prev_color2=0; + +char scanline[80]; + +#ifdef __linux__ + +uint64_t my_rdcycle() { + return 0; +} + +uint64_t my_rdinstret() { 
+ return 0; +} + +#else + +uint64_t my_rdcycle() { + uint64_t result; + uint32_t a0,a1,t0; + { + __asm__ __volatile__ ("rdcycleh %0" : "=r" (a1)); + __asm__ __volatile__ ("rdcycle %0" : "=r" (a0)); + __asm__ __volatile__ ("rdcycleh %0" : "=r" (t0)); + } while(t0 != a1); + + return ((uint64_t)a1 << 32) | a0; +} + +uint64_t my_rdinstret() { + uint64_t result; + uint32_t a0,a1,t0; + { + __asm__ __volatile__ ("rdinstreth %0" : "=r" (a1)); + __asm__ __volatile__ ("rdinstret %0" : "=r" (a0)); + __asm__ __volatile__ ("rdinstreth %0" : "=r" (t0)); + } while(t0 != a1); + + return ((uint64_t)a1 << 32) | a0; +} + +#endif + +uint64_t stats_cycles_init = 0; +uint64_t stats_instructions_init = 0; +uint64_t stats_cycles = 0; +uint64_t stats_instructions = 0; +int stats_CPI_times_1000 = 0; + +void stats_start() { + stats_cycles_init = my_rdcycle(); + stats_instructions_init = my_rdinstret(); +} + +void stats_end() { + stats_cycles = my_rdcycle() - stats_cycles_init; + stats_instructions = my_rdinstret() - stats_instructions_init; + if(stats_cycles==0) { + stats_cycles++; + } + if(stats_instructions==0) { + stats_instructions++; + } + stats_CPI_times_1000 = (int)((stats_cycles * 1000)/stats_instructions); +} + +// Print "fixed point" number (integer/1000) +static void printk(uint64_t kx) { + int intpart = (int)(kx / 1000); + int fracpart = (int)(kx % 1000); + printf("%d.",intpart); + if(fracpart<100) { + printf("0"); + } + if(fracpart<10) { + printf("0"); + } + printf("%d",fracpart); +} + +static inline void setcolors(int fg, int bg) { + printf("\033[4%s;3%sm",colormap[bg],colormap[fg]); +} + +static inline void setpixel(int x, int y, int color) { + if(y&1){ + int color1 = scanline[x]; + int color2 = color; + if(color1 == color2) { + if(prev_color1 == color1) { + putchar(' '); + } else { + printf("\033[4%sm ",colormap[color1]); + prev_color1 = color1; + } + } else { + if(prev_color1 != color1 && prev_color2 != color2) { + printf("\033[4%s;3%sm",colormap[color1],colormap[color2]); + 
prev_color1 = color1; + prev_color2 = color2; + } else if(prev_color1 != color1) { + printf("\033[4%sm",colormap[color1]); + prev_color1 = color1; + } else if(prev_color2 != color2) { + printf("\033[3%sm",colormap[color2]); + prev_color2 = color2; + } + printf("\u2583"); + } + } else { + scanline[x] = color; + } +} + +#define debug(...) +//#define debug printf + +// torus radii and distance from camera +// these are pretty baked-in to other constants now, so it probably won't work +// if you change them too much. +const int dz = 5, r1 = 1, r2 = 2; + +// "Magic circle algorithm"? DDA? I've seen this formulation in a few places; +// first in Hal Chamberlain's Musical Applications of Microprocessors, but not +// sure what to call it, or how to justify it theoretically. It seems to +// correctly rotate around a point "near" the origin, without losing magnitude +// over long periods of time, as long as there are enough bits of precision in x +// and y. I use 14 bits here. +#define R(s,x,y) x-=(y>>s); y+=(x>>s) + +// CORDIC algorithm to find magnitude of |x,y| by rotating the x,y vector onto +// the x axis. This also brings vector (x2,y2) along for the ride, and writes +// back to x2 -- this is used to rotate the lighting vector from the normal of +// the torus surface towards the camera, and thus determine the lighting amount. +// We only need to keep one of the two lighting normal coordinates. 
+int length_cordic(int16_t x, int16_t y, int16_t *x2_, int16_t y2) { + +#ifdef PRECISE + #define NIT 10 +#else + #define NIT 5 +#endif + + int x2 = *x2_; + if (x < 0) { // start in right half-plane + x = -x; + x2 = -x2; + } + for (int i = 0; i> i; + y += t >> i; + x2 -= y2 >> i; + y2 += t2 >> i; + } else { + x += y >> i; + y -= t >> i; + x2 += y2 >> i; + y2 -= t2 >> i; + } + } + // divide by 0.625 as a cheap approximation to the 0.607 scaling factor factor + // introduced by this algorithm (see https://en.wikipedia.org/wiki/CORDIC) + *x2_ = (x2 >> 1) + (x2 >> 3); + return (x >> 1) + (x >> 3) + #ifdef PRECISE + - (x >> 6) // get nrearer to 0.607 [Inigo Quilez] + #endif + ; +} + +int main() { + + printf( "\033[48;5;16m" // set background color black + "\033[38;5;15m" // set foreground color white + "\033[H" // home + "\033[?25l" // hide cursor + "\033[2J"); // clear screen + + int frame = 0; + + // high-precision rotation directions, sines and cosines and their products + int16_t sB = 0, cB = 16384; + int16_t sA = 11583, cA = 11583; + int16_t sAsB = 0, cAsB = 0; + int16_t sAcB = 11583, cAcB = 11583; + + int accurate_CPI_x_1000; + int accurate_MIPS_x_1000; + int CPI_x_1000; + + stats_start(); + + for (;;) { + + int display_on = (frame > START_FRAMES); + if(display_on) { + stats_start(); + } + + int x1_16 = cAcB << 2; + + // yes this is a multiply but dz is 5 so it's (sb + (sb<<2)) >> 6 effectively + int p0x = dz * sB >> 6; + int p0y = dz * sAcB >> 6; + int p0z = -dz * cAcB >> 6; + + const int r1i = r1*256; + const int r2i = r2*256; + + int niters = 0; + int nnormals = 0; + int16_t yincC = (cA >> 6) + (cA >> 5); // 12*cA >> 8; + int16_t yincS = (sA >> 6) + (sA >> 5); // 12*sA >> 8; + int16_t xincX = (cB >> 7) + (cB >> 6); // 6*cB >> 8; + int16_t xincY = (sAsB >> 7) + (sAsB >> 6); // 6*sAsB >> 8; + int16_t xincZ = (cAsB >> 7) + (cAsB >> 6); // 6*cAsB >> 8; + int16_t ycA = -((cA >> 1) + (cA >> 4)); // -12 * yinc1 = -9*cA >> 4; + int16_t ysA = -((sA >> 1) + (sA >> 4)); // 
-12 * yinc2 = -9*sA >> 4; + //int dmin = INT_MAX, dmax = -INT_MAX; + + int xsAsB = (sAsB >> 4) - sAsB; // -40*xincY + int xcAsB = (cAsB >> 4) - cAsB; // -40*xincZ; + + + for (int j = 0; j < 46; j++, ycA += yincC>>1, ysA += yincS>>1) { + + int16_t vxi14 = (cB >> 4) - cB - sB; // -40*xincX - sB; + int16_t vyi14 = ycA - xsAsB - sAcB; + int16_t vzi14 = ysA + xcAsB + cAcB; + + for (int i = 0; i < 79; i++, vxi14 += xincX, vyi14 -= xincY, vzi14 += xincZ) { + int t = 512; // (256 * dz) - r2i - r1i; + + int16_t px = p0x + (vxi14 >> 5); // assuming t = 512, t*vxi>>8 == vxi<<1 + int16_t py = p0y + (vyi14 >> 5); + int16_t pz = p0z + (vzi14 >> 5); + debug("pxyz (%+4d,%+4d,%+4d)\n", px, py, pz); + int16_t lx0 = sB >> 2; + int16_t ly0 = sAcB - cA >> 2; + int16_t lz0 = -cAcB - sA >> 2; + for (;;) { + int t0, t1, t2, d; + int16_t lx = lx0, ly = ly0, lz = lz0; + debug("[%2d,%2d] (px, py) = (%d, %d), (lx, ly) = (%d, %d) -> ", j, i, px, py, lx, ly); + t0 = length_cordic(px, py, &lx, ly); + debug("t0=%d (lx', ly') = (%d, %d)\n", t0, lx, ly); + t1 = t0 - r2i; + t2 = length_cordic(pz, t1, &lz, lx); + d = t2 - r1i; + t += d; + + if (t > 8*256) { + // putchar(' '); + int N = (((j-frame)>>3)^(((i+frame)>>3)))&1; + if(display_on) setpixel(i,j,(N<<2)+26); + break; + } else if (d < 2) { + int N = lz >> 8; + // putchar(".,-~:;!*=#$@"[N > 0 ? N < 12 ? N : 11 : 0]); + N = N > 0 ? N < 26 ? 
N : 25 : 0; + if(display_on) setpixel(i,j,N); + nnormals++; + break; + } + // todo: shift and add version of this + + /* + if (d < dmin) dmin = d; + if (d > dmax) dmax = d; + */ + +#ifdef USE_MUL + px += d*vxi14 >> 14; + py += d*vyi14 >> 14; + pz += d*vzi14 >> 14; +#else + { + // 11x1.14 fixed point 3x parallel multiply + // only 16 bit registers needed; starts from highest bit to lowest + // d is about 2..1100, so 11 bits are sufficient + int16_t dx = 0, dy = 0, dz = 0; + int16_t a = vxi14, b = vyi14, c = vzi14; + while (d) { + if (d&1024) { + dx += a; + dy += b; + dz += c; + } + d = (d&1023) << 1; + a >>= 1; + b >>= 1; + c >>= 1; + } + // we already shifted down 10 bits, so get the last four + px += dx >> 4; + py += dy >> 4; + pz += dz >> 4; + } +#endif + niters++; + } + } + if(display_on && (j&1)) puts(""); + } + if(display_on) printf("\033[0m"); // reset colors + + stats_end(); + + if(frame == START_FRAMES) { + accurate_CPI_x_1000 = stats_CPI_times_1000; + accurate_MIPS_x_1000 = (MHZ * 1000000) / accurate_CPI_x_1000; + } + + CPI_x_1000 = stats_CPI_times_1000; + + uint64_t FPS_num = (uint64_t)(MHZ) * 1000000 * 1000; + uint64_t FPS_denom = stats_cycles; + int FPSx1000 = (int)(FPS_num / FPS_denom); + + setcolors(25,33); +#ifdef USE_MUL + printf("%s RV32IM %dMHz ", CPU_NAME, MHZ); +#else + printf("%s RV32I %dMHz ", CPU_NAME, MHZ); +#endif + + setcolors(25,0); + printf(" "); printk(FPSx1000); printf(" FPS "); + setcolors(0,25); + printf(" "); printk(CPI_x_1000); + printf(" ("); printk(accurate_CPI_x_1000); printf(") CPI "); + setcolors(25,0); + printf(" "); printk(accurate_MIPS_x_1000); printf(" MIPS"); + /* + setcolors(0,25); + printf(" %d iterations ", niters); + setcolors(0,25); + printf(" %d lit pixels ", nnormals); + */ + setcolors(25,0); + printf("\x1b[K"); + +#ifdef __linux__ + fflush(stdout); +#endif + + // rotate sines, cosines, and products thereof + // this animates the torus rotation about two axes + R(5, cA, sA); + R(5, cAsB, sAsB); + R(5, cAcB, sAcB); 
+ R(6, cB, sB); + R(6, cAcB, cAsB); + R(6, sAcB, sAsB); + +#ifdef __linux__ + usleep(15000); +#endif + printf("\r\x1b[23A"); + ++frame; + prev_color1=-1; + prev_color2=-1; + } + + return 0; +} + diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/pipeline9.v b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/pipeline9.v index a2d079c6..12ba6877 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/pipeline9.v +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/pipeline9.v @@ -9,7 +9,7 @@ `define CONFIG_RAS // return address stack `define CONFIG_GSHARE // gshare branch prediction (or BTFNT if not set) -`define CONFIG_DEBUG // debug mode, displays execution +//`define CONFIG_DEBUG // debug mode, displays execution // See "debugger" section in source // to define breakpoints diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/run_verilator.sh b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/run_verilator.sh index f2e35d47..f6f62286 100755 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/run_verilator.sh +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/run_verilator.sh @@ -1,4 +1,4 @@ -(cd obj_dir; rm -f *.cpp *.o *.a VSOC; make -f VSOC.mk) +(cd obj_dir; rm -f *.cpp *.o *.a VSOC) verilator -CFLAGS '-I../../../FIRMWARE/LIBFEMTORV32 -DSTANDALONE_FEMTOELF' -DBENCH -DBOARD_FREQ=10 -DCPU_FREQ=10 -DPASSTHROUGH_PLL -Wno-fatal \ --top-module SOC -cc -exe sim_main.cpp ../../FIRMWARE/LIBFEMTORV32/femto_elf.c $1 (cd obj_dir; make -f VSOC.mk) From aa9315c6d27bd75ff08bcb9ffe66f09a3f736646 Mon Sep 17 00:00:00 2001 From: Bruno Date: Sat, 6 Jan 2024 21:28:29 +0100 Subject: [PATCH 04/23] fixed #90 (ST_NICCC demo was too large to fit in BRAM) --- Basic/FOMU/FOMU_VGA/vga.v | 4 +- .../ASM_EXAMPLES/mandelbrot_terminal.S | 4 +- FemtoRV/FIRMWARE/EXAMPLES/hello_LED.c | 6 ++- FemtoRV/FIRMWARE/EXAMPLES/mandel_float.c | 2 + FemtoRV/RTL/CONFIGS/icestick_config.v | 2 +- .../BOARDS/run_icestick_show.sh | 9 ++++ .../FROM_BLINKER_TO_RISCV/FIRMWARE/ST_NICCC.c | 46 ++++++++++++++++--- .../FROM_BLINKER_TO_RISCV/emitter_uart.v | 2 
+- .../FROM_BLINKER_TO_RISCV/pipelineZ.v | 2 +- .../FROM_BLINKER_TO_RISCV/terminal.sh | 1 + 10 files changed, 62 insertions(+), 16 deletions(-) create mode 100755 FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/BOARDS/run_icestick_show.sh diff --git a/Basic/FOMU/FOMU_VGA/vga.v b/Basic/FOMU/FOMU_VGA/vga.v index cd5291cd..ba6f22b3 100644 --- a/Basic/FOMU/FOMU_VGA/vga.v +++ b/Basic/FOMU/FOMU_VGA/vga.v @@ -28,8 +28,8 @@ module vga ( wire pixel_clk; // Choose your video mode here: -`define VGA_MODE_640x480 -//`define VGA_MODE_1024x768 +//`define VGA_MODE_640x480 +`define VGA_MODE_1024x768 //`define VGA_MODE_1280x1024 `ifdef VGA_MODE_640x480 diff --git a/FemtoRV/FIRMWARE/ASM_EXAMPLES/mandelbrot_terminal.S b/FemtoRV/FIRMWARE/ASM_EXAMPLES/mandelbrot_terminal.S index 4b2229cb..a8308003 100644 --- a/FemtoRV/FIRMWARE/ASM_EXAMPLES/mandelbrot_terminal.S +++ b/FemtoRV/FIRMWARE/ASM_EXAMPLES/mandelbrot_terminal.S @@ -87,8 +87,8 @@ exit_Z: li a0,13 call putchar - li a0,10 - call putchar +# li a0,10 +# call putchar add s1,s1,1 add s3,s3,dy diff --git a/FemtoRV/FIRMWARE/EXAMPLES/hello_LED.c b/FemtoRV/FIRMWARE/EXAMPLES/hello_LED.c index 29479f69..d4a6e5d1 100644 --- a/FemtoRV/FIRMWARE/EXAMPLES/hello_LED.c +++ b/FemtoRV/FIRMWARE/EXAMPLES/hello_LED.c @@ -6,10 +6,12 @@ int main() { MAX7219_tty_init(); // redirect printf() to led matrix scroller for(;;) { - printf("Hello, RISC-V world \001 \002 \001 \002 "); -// printf("Hello, TelecomNancy ! \001 \002 Best school ! \001 \002 "); +// printf("Hello, RISC-V world \001 \002 \001 \002 "); +// printf("Hello, TelecomNancy ! \001 \002 Best school ! \001 \002 "); // printf("Hello FemtoRV friend !!! \001 \002 \001 \002 "); // printf("Hello, Hackaday \001 \002 Greetings from FemtoRV !!! "); +// printf("Hello, RISC-V world \001 \002 \001 \002 "); + printf("Hello, caf\202 LoOPS ! 
\001 \002 \001 \002 "); } return 0; } diff --git a/FemtoRV/FIRMWARE/EXAMPLES/mandel_float.c b/FemtoRV/FIRMWARE/EXAMPLES/mandel_float.c index f9baafe0..8170b3ac 100644 --- a/FemtoRV/FIRMWARE/EXAMPLES/mandel_float.c +++ b/FemtoRV/FIRMWARE/EXAMPLES/mandel_float.c @@ -32,6 +32,7 @@ void mandel() { for(int Y=0; Y #include +#else +#include "io.h" #endif - +// when compiling for SPI flash, uncomment to fit some routines in fast BRAM +// (but it does not change much, the bottleneck is ANSI RGB encoding and uart. //#define RV32_FASTCODE __attribute((section(".fastcode"))) #define RV32_FASTCODE +// when compiling for SPI flash, uncomment to enable wireframe mode (but it is ugly +// and it will not fit in BRAM !) +// #define WITH_WIREFRAME + +#ifdef WITH_WIREFRAME int wireframe = 0; +#endif #define MIN(x,y) ((x) < (y) ? (x) : (y)) #define MAX(x,y) ((x) > (y) ? (x) : (y)) + /**********************************************************************************/ /* Graphics routines */ /**********************************************************************************/ @@ -46,13 +56,11 @@ static inline uint8_t map_y(uint8_t y) { return y >> 2; } - void GL_clear() { printf("\033[48;5;16m" // set background color black "\033[2J"); // clear screen } - /* * Set background color using 6x6x6 colorcube codes * see https://stackoverflow.com/questions/4842424/list-of-ansi-color-escape-sequences @@ -69,6 +77,7 @@ static inline void GL_setpixel(int x, int y) { printf("\033[%d;%dH ",y,x); // Goto_XY(x1,y) and print space } +#ifdef WITH_WIREFRAME void GL_line(int x1, int y1, int x2, int y2) RV32_FASTCODE; void GL_line(int x1, int y1, int x2, int y2) { int x,y,dx,dy,sy,tmp; @@ -83,7 +92,7 @@ void GL_line(int x1, int y1, int x2, int y2) { y1 = tmp; } - /* Bresenham line drawing. */ + // Bresenham line drawing. 
dy = y2 - y1; sy = 1; if(dy < 0) { @@ -127,6 +136,7 @@ void GL_line(int x1, int y1, int x2, int y2) { } } } +#endif void GL_fillpoly(int nb_pts, int* points) RV32_FASTCODE; void GL_fillpoly(int nb_pts, int* points) { @@ -163,12 +173,14 @@ void GL_fillpoly(int nb_pts, int* points) { int x2 = points[2*i2]; int y2 = points[2*i2+1]; +#ifdef WITH_WIREFRAME if(wireframe) { if((clockwise > 0) ^ (y2 > y1)) { GL_line(x1,y1,x2,y2); } continue; } +#endif char* x_buffer = ((clockwise > 0) ^ (y2 > y1)) ? x_left : x_right; int dx = x2 - x1; @@ -208,7 +220,10 @@ void GL_fillpoly(int nb_pts, int* points) { } } - if(!wireframe) { +#ifdef WITH_WIREFRAME + if(!wireframe) +#endif + { for(int y = miny; y <= maxy; ++y) { int x1 = x_left[y]; int x2 = x_right[y]; @@ -431,18 +446,35 @@ int read_frame() { int main() { // printf("\x1B[?25l"); // hide cursor - wireframe = 0; + +#ifndef __linux__ + IO_OUT(IO_LEDS,15); +#endif + printf("starting\n"); + +#ifdef WITH_WIREFRAME + wireframe = 0; +#endif + int frame = 0; GL_clear(); for(;;) { spi_reset(); + frame = 0; while(read_frame()) { +#ifdef WITH_WIREFRAME if(wireframe) { GL_clear(); } +#endif #ifdef __linux__ usleep(20000); -#endif +#else + IO_OUT(IO_LEDS,frame); +#endif + ++frame; } +#ifdef WITH_WIREFRAME wireframe = !wireframe; +#endif } } diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/emitter_uart.v b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/emitter_uart.v index adf939f5..25ca7816 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/emitter_uart.v +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/emitter_uart.v @@ -1,7 +1,7 @@ module corescore_emitter_uart #( parameter clk_freq_hz = 0, - parameter baud_rate = 57600) + parameter baud_rate = 1000000) ( input wire i_clk, input wire i_rst, diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/pipelineZ.v b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/pipelineZ.v index ee80ce74..e3bdaea2 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/pipelineZ.v +++ 
b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/pipelineZ.v @@ -12,7 +12,7 @@ `define CONFIG_RV32M // RV32M instruction set (MUL,DIV,REM) -`define CONFIG_DEBUG // debug mode, displays execution +//`define CONFIG_DEBUG // debug mode, displays execution // See "debugger" section in source // to define breakpoints diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/terminal.sh b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/terminal.sh index 5ef16d1b..c91b3b54 100755 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/terminal.sh +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/terminal.sh @@ -1,6 +1,7 @@ DEVICE=/dev/ttyUSB1 # replace by the terminal used by your device BAUDS=1000000 + # MINITERM exit: ] package: sudo apt-get install python3-serial #miniterm --dtr=0 $DEVICE $BAUDS From 9166a1f14fa74d270b73b8269754d1ac65f10b1d Mon Sep 17 00:00:00 2001 From: Bruno Date: Sat, 6 Jan 2024 21:39:12 +0100 Subject: [PATCH 05/23] added program to display the beginning of the SPI flash --- .../FROM_BLINKER_TO_RISCV/FIRMWARE/read_spiflash.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/read_spiflash.c diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/read_spiflash.c b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/read_spiflash.c new file mode 100644 index 00000000..1d2bfe0d --- /dev/null +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/read_spiflash.c @@ -0,0 +1,14 @@ +#include "io.h" + +#define SPI_FLASH_BASE ((char*)(1 << 23)) + +int main() { + for(int i=0; i<16; ++i) { + IO_OUT(IO_LEDS,i); + int lo = (int)SPI_FLASH_BASE[2*i ]; + int hi = (int)SPI_FLASH_BASE[2*i+1]; + print_hex_digits((hi << 8) | lo,4); // print four hexadecimal digits + printf(" "); + } + printf("\n"); +} From 6c5c727192ef31c4b23a126a787726e7cfc1a27b Mon Sep 17 00:00:00 2001 From: Bruno Date: Sat, 6 Jan 2024 23:32:53 +0100 Subject: [PATCH 06/23] addded sanity check and warning message in FIRMWARE_WORDS, to detect situation with nearly 
full BRAM --- .../TOOLS/FIRMWARE_WORDS_SRC/firmware_words.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/FemtoRV/FIRMWARE/TOOLS/FIRMWARE_WORDS_SRC/firmware_words.cpp b/FemtoRV/FIRMWARE/TOOLS/FIRMWARE_WORDS_SRC/firmware_words.cpp index d5a645ac..70a9b623 100644 --- a/FemtoRV/FIRMWARE/TOOLS/FIRMWARE_WORDS_SRC/firmware_words.cpp +++ b/FemtoRV/FIRMWARE/TOOLS/FIRMWARE_WORDS_SRC/firmware_words.cpp @@ -442,9 +442,16 @@ int main(int argc, char** argv) { << (RAM_SIZE/4) << " words )" << std::endl; - std::cout << "Occupancy: " << (max_addr*100) / RAM_SIZE - << "%" << std::endl; + int occupancy = (max_addr*100) / RAM_SIZE; + + std::cout << "Occupancy: " << occupancy << "%" << std::endl; + + if(occupancy > 95) { + std::cerr << " ********** WARNING ************ " + << "RAM is almost full, program may crash if stack overflows" + << std::endl; + } if(MAX_ADDR != 0) { std::cout << "testing MAX_ADDR limit: " << MAX_ADDR << std::endl; From 97a27596e069a3966dbce170a387fd1375d85c18 Mon Sep 17 00:00:00 2001 From: Bruno Date: Sun, 7 Jan 2024 07:59:15 +0100 Subject: [PATCH 07/23] added more information about SPI flash and an additional program to display its contents --- .../TUTORIALS/FROM_BLINKER_TO_RISCV/README.md | 94 ++++++++++++++++++- 1 file changed, 92 insertions(+), 2 deletions(-) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md index ee78ce5c..787e24b4 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md @@ -2778,6 +2778,13 @@ extracts the parts that are interesting for us and writes them in ASCII hexadeci _Note_ you can invoke `make xxxx.bram.hex` directly, it will invoke the assembler, linker and elf conversion utility for you automatically. +_Note_ on the IceStick, we only have `6kB` of RAM, so only tiny programs will fit. If the compiled +program is larger than `6kB` then you will get an error. 
A more problematic case is a program that +nearly fills the whole BRAM, then we have nearly no space for the stack, and the stack will overwrite +the rest, putting the CPU in an invalid state, probably frozen. This situation is difficult to understand / +to debug when you encounter it, so `firmware_words` displays a big warning message whenever the generated +code fills more than 95% of the BRAM. + Now you can run the example in simulation and on the device: ``` $ cd .. @@ -3119,10 +3126,93 @@ has 24 bits only, we can save significant resources there: wire [ADDR_WIDTH-1:0] loadstore_addr = rs1 + (isStore ? Simm : Iimm); ``` +The up to date verilog file is available in [step22.v](step22.v). Let us now check +that we are able to access the SPI flash from our processor, with the following +[program](FIRMWARE/read_spiflash.c): +```C +#include "io.h" +#define SPI_FLASH_BASE ((char*)(1 << 23)) +int main() { + for(int i=0; i<16; ++i) { + IO_OUT(IO_LEDS,i); + int lo = (int)SPI_FLASH_BASE[2*i ]; + int hi = (int)SPI_FLASH_BASE[2*i+1]; + print_hex_digits((hi << 8) | lo,4); // print four hexadecimal digits + printf(" "); + } + printf("\n"); +} +``` + +The SPI flash is mapped in memory space, using addresses with bits 23 or 24 set (the +first address, that we call `SPI_FLASH_BASE`, is `1 << 23`). Then we access all individual +bytes, and display them by grouping them into 16-bit words (for each word, the first byte +in memory is the least significant one, because RISC-V follows the little-endian convention). +We have a `print_hex_digits()` function in [FIRMWARE/print.c](FIRMWARE/print.c) that does the job +(the second argument is the number of hex characters we want to print for each number). + +Now compile the program, synthesize the design and send it to the device as follows: + +``` + $ cd FIRMWARE + $ make read_spiflash.bram.hex + $ cd .. + $ BOARDS/run_icestick.sh step22.v + $ ./terminal.sh +``` + +... and you see nothing. Why is this so ? 
The program finished before you started the terminal, +so we were not able to see anything, but you can reset the processor, pushing the invisible reset +button (mentioned in [step 2](README.md#step-2-slower-blinky)). Each time you push the +"button", it will display on the terminal the first 16 words stored in the SPI flash. +On an IceStick, you will see something like: +``` +00FF FF00 AA7E 7E99 0051 0501 0092 6220 4B01 0072 8290 0000 0011 0101 0000 0000 +``` + +Do you have an idea where these values come from ? Remember why there is this SPI flash chip on your FPGA +board: it is where your design is stored. When the FPGA starts, it loads its design from the SPI flash. The +design corresponds to the file `SOC.bin`, that is generated at the end of the `yosys/nextpnr/icepack` pipeline: +- `yosys` transforms your verilog into a "circuit", also called a "netlist" +- then `nextpnr` maps the gates of this circuit to the logical elements of the FPGA, +- and finally `icepack` converts the result into a "binary stream" directly understood by the FPGA. + +Let us examine the 16 first words of the binary stream: + +``` + $ od -x -N 32 SOC.bin +``` + +Then you'll see something like: +``` +0000000 00ff ff00 aa7e 7e99 0051 0501 0092 6220 +0000020 4b01 0072 8290 0000 0011 0101 0000 0000 +0000040 +``` + +and this corresponds to what we have just seen on the terminal, read from the SPI flash chip. +So our CPU can read its own FPGA representation from the SPI flash, like a biologist sequencing his +own DNA ! While it has a nice and intriguing recursion flavor, it is probably of very little practical +use, but let us take a deeper look at it: the `SOC.bin` file is not very large: + +``` +$ ls -al SOC.bin +-rw-rw-r-- 1 blevy blevy 32220 Jan 7 07:31 SOC.bin +``` + +It weights only `32Kb` or so, and our SPI flash chip has capacity for `2Mb` or so, so there is plenty of room for us ! 
+The only thing we need to take care of is not overwriting the FPGA configuration (in other words, always start further +away then the size of `SOC.bin`). So we will use a `1Mb` offset for storing our data (you will say we are wasting a lot +of space between `32Kb` and `1Mb` but we shall use that space for something else in subsequent steps of this tutorial). + +**Try this** Create a text file `hello.txt`, send it to the FPGA at the `1Mb` offset using `iceprog -o 1M hello.txt`, write +a program that displays the stored file. To know where to stop, you may need either to decide for a termination character +or to precode the length of the file. + ![](ST_NICCC_tty.png) -OK, so now we are ready to test the new storage that we have. Up to date -verilog file is avalaible in [step22.v](step22.v). What we will do is displaying +OK, so now we are ready to use the new storage that we have for more interesting things. +What we will do is displaying an animation on the terminal. The animation is a demo from the 90's, that streams polygon data to a software polygon renderer. 
Polygon data is a 640 kB binary file, available from `learn_fpga/FemtoRV/FIRMWARE/EXAMPLES/DATA/scene1.dat` (see other From ca76c6def0f877fe32ab647d1a6fe448873c911e Mon Sep 17 00:00:00 2001 From: Bruno Date: Sun, 7 Jan 2024 08:10:26 +0100 Subject: [PATCH 08/23] small fixes in the test, and explanations how to send data to the SPI flash on ECP5 boards --- .../TUTORIALS/FROM_BLINKER_TO_RISCV/README.md | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md index 787e24b4..132392cc 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md @@ -3144,7 +3144,7 @@ int main() { } ``` -The SPI flash is mapped in memory space, using addresses with bits 23 or 24 set (the +The SPI flash is mapped in memory space, using addresses with bit 23 set (the first address, that we call `SPI_FLASH_BASE`, is `1 << 23`). Then we access all individual bytes, and display them by grouping them into 16-bit words (for each word, the first byte in memory is the least significant one, because RISC-V follows the little-endian convention). @@ -3200,15 +3200,28 @@ $ ls -al SOC.bin -rw-rw-r-- 1 blevy blevy 32220 Jan 7 07:31 SOC.bin ``` -It weights only `32Kb` or so, and our SPI flash chip has capacity for `2Mb` or so, so there is plenty of room for us ! +It weighs only `32KB` or so, and our SPI flash chip has capacity for `4MB`, so there is plenty of room for us ! The only thing we need to take care of is not overwriting the FPGA configuration (in other words, always start further -away then the size of `SOC.bin`). So we will use a `1Mb` offset for storing our data (you will say we are wasting a lot -of space between `32Kb` and `1Mb` but we shall use that space for something else in subsequent steps of this tutorial). +away than the size of `SOC.bin`). 
So we will use a `1MB` offset for storing our data (you will say we are wasting a lot +of space between `32KB` and `1MB` but we shall use that space for something else in subsequent steps of this tutorial). -**Try this** Create a text file `hello.txt`, send it to the FPGA at the `1Mb` offset using `iceprog -o 1M hello.txt`, write +**Try this** Create a text file `hello.txt`, send it to the FPGA at the `1MB` offset (see below how to do that), write a program that displays the stored file. To know where to stop, you may need either to decide for a termination character or to precode the length of the file. +For ICE40 boards (IceStick, IceBreaker, ...), use: +``` + $ iceprog -o 1M hello.txt +``` + +For ECP5 boards (ULX3S), use: +``` + $ cp hello.txt hello.img + $ ujprog -j flash -f 1048576 hello.img +``` +(using latest version of `ujprog` compiled from [https://github.com/kost/fujprog](https://github.com/kost/fujprog)). + + ![](ST_NICCC_tty.png) OK, so now we are ready to use the new storage that we have for more interesting things. From eb6b524cb3ab45232d0befea6c94750869785d11 Mon Sep 17 00:00:00 2001 From: Bruno Date: Sun, 7 Jan 2024 08:21:25 +0100 Subject: [PATCH 09/23] more details about address decoding when there is mapped SPI flash --- FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md index 132392cc..dab9e63c 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md @@ -2960,7 +2960,11 @@ module MappedSPIFlash( Now the idea is to modify our SOC in such a way that some addresses correspond to the SPI flash. First we need to decide how it will be projected into the memory space of our processor. The -idea is to use bit 23 of memory addresses to select the SPI Flash. +idea is to use bit 23 of memory addresses to select the SPI Flash. 
Bit 22 is for IO (LEDs, UART). In +addition, for IO, we need to check that bit 23 is zero. And if both bits 23 and 22 are zero, then we +are in BRAM. So our memory space is decomposed into four "quadrants" depending on bits 23 and 22, and +we use three of them. + Then we have the different signals to discriminate the different zones of our memory: ```verilog From 2c849b1cf071725cb51131a5b3c4932bc7359d06 Mon Sep 17 00:00:00 2001 From: Bruno Date: Sun, 7 Jan 2024 08:29:27 +0100 Subject: [PATCH 10/23] added a scratchpad for episode III on interrupts --- .../FROM_BLINKER_TO_RISCV/INTERRUPTS.md | 24 +++++++++++++++++++ .../FROM_BLINKER_TO_RISCV/PIPELINE.md | 2 +- .../TUTORIALS/FROM_BLINKER_TO_RISCV/README.md | 5 ++-- 3 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md new file mode 100644 index 00000000..70b2fb0d --- /dev/null +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md @@ -0,0 +1,24 @@ +# From Blinker to Risc-V Episode III - Interrupts + +This is WIP, for now just a scratchpad with notes. + +Goals: +- create a step-by-step gentle introduction, morphing the processor obtained at the end of Episode I into something that can run FreeRTOS (suggested by @jimmylu890303). +- maybe go a little bit further into the priviledged ISA, and run Linux-nommu (only if this does not require too much additional material) + + +I think that @Mecrisp's `gracilis` (extended with the memory-mapped register plus the interrupt source) has everything needed. + +- The first thing to do is of course to get the thing running. How to add a mapped register is explained [here](https://github.com/BrunoLevy/learn-fpga/tree/master/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV#step-17-memory-mapped-device---lets-do-much-more-than-a-blinky-). Then we'll need to wire the interrupt source. 
+- Then we'll need to write a clear explanation of how the Risc-V priviledged instruction set works. This will require some writing, because I think that the official specification [here](https://riscv.org/wp-content/uploads/2017/05/riscv-privileged-v1.10.pdf) is very difficult to read: + - it lists all possible CSRs, whereas we only need to explain a couple of them + - clarify what are in-processor CSRs and memory-mapped ones (it is not super clear to me !) + - explain what happens when an interrupt is fired and what happens when one returns from an interrupt +- We may also need to explain the RISC-V interrupt controller specification [PLIC](https://9p.io/sources/contrib/geoff/riscv/riscv-plic.pdf) . It is unclear to me what is CLINT, what is PLIC etc..., need to read more. + +For the tutorial, I'd like to continue with the "step by step incremental modification" approach of episode I, so the "scenario" could be something like (first draft): +- start from the 'quark' obtained at the end of episode I +- add `interrupt_request` wire, and `mstatus`, `mtvec` CSRs. Wire `interrupt_request` to a physical button. Write a simple example program that does something interesting. For instance, we could have an ascii animation of a bouncing ball, running in an infinite loop, and the interrupt adds a random force to the ball. With two buttons, we could write something like a 'pong' or 'breakout' game. +- add timer interrupt source. Write an example with minimalistic multitasking, demonstrating context swapping (@Mecrisp has it already). For instance, we could have two or three balls bouncing on the screen, each ball has its own thread. +- now an example with both timer interrupt source and buttons: multithreaded pong game, one thread for the ball, one thread for the paddle, one thread for game logic +- run FreeRTOS (maybe a couple of intermediary steps needed, in particular about simulation / verilator etc...) 
diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/PIPELINE.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/PIPELINE.md index 984e8f83..2b350091 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/PIPELINE.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/PIPELINE.md @@ -1,4 +1,4 @@ -# From Blinker to RISC-V episode II +# From Blinker to RISC-V episode II - Pipelining In the [previous episode](README.md), we learnt how to create a fully functional RISC-V processor on a FPGA. Our processor is not the most efficient, diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md index dab9e63c..58e3a0d6 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md @@ -36,8 +36,9 @@ it is not as fun). These graphic program are all displayed in text mode on the terminal, using ANSI escape sequences (yes, this makes BIG pixels). For more fun, it is also possible to use a small OLED display instead (will add instructions for that in the future). -- [Episode II](PIPELINE.md) is a WIP on pipelining, that I'm currently trying to understand (and writing - a tutorial about something is a good way of making sure you understand it !). +- [Episode II](PIPELINE.md) is on pipelining, you will learn there how to transform the basic processor + obtained at the end of this tutorial into a more efficient pipelined processor with branch prediction. +- [Episode III](INTERRUPTS.md) is a WIP on interrupts and the privileged RISC-V ISA. - This tutorial is in VERILOG. 
It is currently being ported into other HDLs - [Amaranth/nMigen version](https://github.com/bl0x/learn-fpga-amaranth) by @bl0x - TODO: Silice version From ea00a30bc01f9ccc51f8fa77292e45abeecf1214 Mon Sep 17 00:00:00 2001 From: Bruno Date: Sun, 7 Jan 2024 10:35:47 +0100 Subject: [PATCH 11/23] put comment on program size later in the tutorial, where we learn how to compile C (better there I think) --- FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md index 58e3a0d6..a18f26e3 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md @@ -2779,12 +2779,6 @@ extracts the parts that are interesting for us and writes them in ASCII hexadeci _Note_ you can invoke `make xxxx.bram.hex` directly, it will invoke the assembler, linker and elf conversion utility for you automatically. -_Note_ on the IceStick, we only have `6kB` of RAM, so only tiny programs will fit. If the compiled -program is larger than `6kB` then you will get an error. A more problematic case is a program that -nearly fills the whole BRAM, then we have nearly no space for the stack, and the stack will overwrite -the rest, putting the CPU in an invalid state, probably frozen. This situation is difficult to understand / -to debug when you encounter it, so `firmware_words` displays a big warning message whenever the generated -code fills more than 95% of the BRAM. Now you can run the example in simulation and on the device: ``` @@ -2904,6 +2898,13 @@ Now you can see that your processor is not just a toy, it is a real RISC-V processor on which you can run programs produced by standard tools ! +_Note_ on the IceStick, we only have `6kB` of RAM, so only tiny programs will fit. If the compiled +program is larger than `6kB` then you will get an error. 
A more problematic case is a program that +nearly fills the whole BRAM, then we have nearly no space for the stack, and the stack will overwrite +the rest, putting the CPU in an invalid state, probably frozen. This situation is difficult to understand / +to debug when you encounter it, so `firmware_words` displays a big warning message whenever the generated +code fills more than 95% of the BRAM. + ## Step 22: Storing data: can I have more than 6 kB of memory ? _and some optimizations in the processor_ From 1c6f345518fa06639f8fdb230778bf0d957108c6 Mon Sep 17 00:00:00 2001 From: Bruno Date: Mon, 8 Jan 2024 08:53:44 +0100 Subject: [PATCH 12/23] more notes for INTERRUPT.md scratchpad --- .../FROM_BLINKER_TO_RISCV/INTERRUPTS.md | 111 ++++++++++++++++-- LiteX/software/README.md | 3 +- 2 files changed, 101 insertions(+), 13 deletions(-) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md index 70b2fb0d..3fb292ad 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md @@ -3,22 +3,109 @@ This is WIP, for now just a scratchpad with notes. Goals: -- create a step-by-step gentle introduction, morphing the processor obtained at the end of Episode I into something that can run FreeRTOS (suggested by @jimmylu890303). -- maybe go a little bit further into the priviledged ISA, and run Linux-nommu (only if this does not require too much additional material) + +- create a step-by-step gentle introduction, morphing the processor + obtained at the end of Episode I into something that can run FreeRTOS + (suggested by @jimmylu890303). + +- maybe go a little bit further into the priviledged ISA, and run + Linux-nommu (only if this does not require too much additional + material) I think that @Mecrisp's `gracilis` (extended with the memory-mapped register plus the interrupt source) has everything needed. 
-- The first thing to do is of course to get the thing running. How to add a mapped register is explained [here](https://github.com/BrunoLevy/learn-fpga/tree/master/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV#step-17-memory-mapped-device---lets-do-much-more-than-a-blinky-). Then we'll need to wire the interrupt source. -- Then we'll need to write a clear explanation of how the Risc-V priviledged instruction set works. This will require some writing, because I think that the official specification [here](https://riscv.org/wp-content/uploads/2017/05/riscv-privileged-v1.10.pdf) is very difficult to read: +- The first thing to do is of course to get the thing running. How to + add a mapped register is explained + [here](https://github.com/BrunoLevy/learn-fpga/tree/master/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV#step-17-memory-mapped-device---lets-do-much-more-than-a-blinky-). Then + we'll need to wire the interrupt source. + +- Then we'll need to write a clear explanation of how the Risc-V + priviledged instruction set works. This will require some writing, + because I think that the official specification + [here](https://riscv.org/wp-content/uploads/2017/05/riscv-privileged-v1.10.pdf) + is very difficult to read: + - it lists all possible CSRs, whereas we only need to explain a couple of them - - clarify what are in-processor CSRs and memory-mapped ones (it is not super clear to me !) - - explain what happens when an interrupt is fired and what happens when one returns from an interrupt -- We may also need to explain the RISC-V interrupt controller specification [PLIC](https://9p.io/sources/contrib/geoff/riscv/riscv-plic.pdf) . It is unclear to me what is CLINT, what is PLIC etc..., need to read more. + + - clarify what are in-processor CSRs and memory-mapped ones (it is + not super clear to me !) 
+ + - explain what happens when an interrupt is fired and what happens + when one returns from an interrupt + +- We may also need to explain the RISC-V interrupt controller + specification + [PLIC](https://9p.io/sources/contrib/geoff/riscv/riscv-plic.pdf). + It is unclear to me what is CLINT, what is PLIC etc..., need to + read more. + +For the tutorial, I'd like to continue with the "step by step +incremental modification" approach of episode I, so the "scenario" +could be something like (first draft): -For the tutorial, I'd like to continue with the "step by step incremental modification" approach of episode I, so the "scenario" could be something like (first draft): - start from the 'quark' obtained at the end of episode I -- add `interrupt_request` wire, and `mstatus`, `mtvec` CSRs. Wire `interrupt_request` to a physical button. Write a simple example program that does something interesting. For instance, we could have an ascii animation of a bouncing ball, running in an infinite loop, and the interrupt adds a random force to the ball. With two buttons, we could write something like a 'pong' or 'breakout' game. -- add timer interrupt source. Write an example with minimalistic multitasking, demonstrating context swapping (@Mecrisp has it already). For instance, we could have two or three balls bouncing on the screen, each ball has its own thread. -- now an example with both timer interrupt source and buttons: multithreaded pong game, one thread for the ball, one thread for the paddle, one thread for game logic -- run FreeRTOS (maybe a couple of intermediary steps needed, in particular about simulation / verilator etc...) + +- add `interrupt_request` wire, and `mstatus`, `mtvec` CSRs. Wire + `interrupt_request` to a physical button. Write a simple example + program that does something interesting. For instance, we could have + an ascii animation of a bouncing ball, running in an infinite loop, + and the interrupt adds a random force to the ball. 
With two buttons, + we could write something like a 'pong' or 'breakout' game. + +- add timer interrupt source. Write an example with minimalistic + multitasking, demonstrating context swapping (@Mecrisp has it + already). For instance, we could have two or three balls bouncing on + the screen, each ball has its own thread. + +- now an example with both timer interrupt source and buttons: + multithreaded pong game, one thread for the ball, one thread for the + paddle, one thread for game logic + +- run FreeRTOS (maybe a couple of intermediary steps needed, in + particular about simulation / verilator etc...) + +List of questions: + +- what is the minimal list of CSRs and instructions needed to run FreeRTOS ? First guess: + - mepc: saved PC + - mtvec: interrupt handler + - mstatus, MIE bit (3) (Interrupt Enable). Do we need other bits ? + - mcause, interrupt bit (31): do we need other bits ? (e.g., distinguish timer and UART) + - mtime, mtimecmp +- system calls in FreeRTOS, do they use a trap ? Do we need to distinguish hw interrupt + from system call ? (additional bit in mcause). +- do we need different protection levels for FreeRTOS ? +- @mecrisp has an `interrupt_request_sticky` flipflop (that transitions to 1 whenever + the `interrupt_request` goes high, and that transitions to zero after one returns from + an exception handler). How does it relate to the `MIP` bit (interrupt pending) ? +- the specification mentions "memory-mapped CSRs", are they supposed to be both in the + processor and projected to memory space ? (or any combination of both options ?). I + think that it is compliant with the norm as soon as the `CSRRx` instruction works + (either in-silicon with in-core regs or implemented in a trap handler). So in most + cases, the minimal in-silicon kernel has `mepc`, `mtvec`, `mstatus` and `mcause`, + and triggers an exception as soon as other CSRs are accessed. 
Then `mtime` and + `mtimecmp` can be either implemented in another piece of hw (that fires the + `interrupt_requet` wire) or directly implemented in-silicon. I wonder what is the + gain of the first option (external `mtime`, `mtimecmp`), does it make the CPU + simpler ? I am unsure, because in the end the weight of CPU plus interrupt controller + will be more or less the same (maybe the address decoder for the CSRs can be simpler, + we can use 1-hot encoding in the page of memory-mapped CSRs, and let the trap handler + do the CSR address translation). +- this leads to the questions of what is PLIC,CLINT,CLIC ? (I guess they are + specifications of how things should work with a separate interrupt logic with + memory-mapped CSRs, especially in a multicore context where each core can access + other core's CSRs, but is is still **very unclear** to me). In particular, which + one is relevant for us ? Do we need to implement one of them or can we do + something simpler ? + - [PLIC](https://github.com/riscv/riscv-plic-spec/blob/master/riscv-plic.adoc) + - [(A)CLINT](https://github.com/riscv/riscv-aclint/blob/main/riscv-aclint.adoc) + - [CLIC](https://github.com/riscv/riscv-fast-interrupt/blob/master/clic.adoc) + +Links: + +- @cnlohr's [minirv32](https://github.com/cnlohr/mini-rv32ima) + +- Linux-capable @ultraembedded's [exact-step](https://github.com/ultraembedded/exactstep/blob/master/cpu-rv32/rv32.cpp) + +- @regymm [quasi-soc](https://github.com/regymm/quasiSoC) diff --git a/LiteX/software/README.md b/LiteX/software/README.md index d222c7d0..9d43803a 100644 --- a/LiteX/software/README.md +++ b/LiteX/software/README.md @@ -20,4 +20,5 @@ Links, stuff to port - [dos-like](https://github.com/mattiasgustavsson/dos-like) - [tiny-gl](https://github.com/C-Chads/tinygl) - [tcc-riscv](https://github.com/sellicott/tcc-riscv32) -- [Bubble Universe](https://stardot.org.uk/forums/viewtopic.php?t=25833&sid=33182a6ffa6f84b08bb6f52cae2ad35d) \ No newline at end of file +- [Bubble 
Universe](https://stardot.org.uk/forums/viewtopic.php?t=25833&sid=33182a6ffa6f84b08bb6f52cae2ad35d) +- [shecc](https://github.com/sysprog21/shecc) From 99e441830b6fa175987ddd8baaade3641b408a6d Mon Sep 17 00:00:00 2001 From: Bruno Date: Mon, 8 Jan 2024 09:02:42 +0100 Subject: [PATCH 13/23] small update of notes --- .../TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md index 3fb292ad..d2d9688a 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md @@ -82,11 +82,11 @@ List of questions: - the specification mentions "memory-mapped CSRs", are they supposed to be both in the processor and projected to memory space ? (or any combination of both options ?). I think that it is compliant with the norm as soon as the `CSRRx` instruction works - (either in-silicon with in-core regs or implemented in a trap handler). So in most - cases, the minimal in-silicon kernel has `mepc`, `mtvec`, `mstatus` and `mcause`, + (either with in-core regs or implemented in a trap handler). So in most + cases, the minimal in-core kernel has `mepc`, `mtvec`, `mstatus` and `mcause`, and triggers an exception as soon as other CSRs are accessed. Then `mtime` and `mtimecmp` can be either implemented in another piece of hw (that fires the - `interrupt_requet` wire) or directly implemented in-silicon. I wonder what is the + `interrupt_request` wire) or directly implemented in-core. I wonder what is the gain of the first option (external `mtime`, `mtimecmp`), does it make the CPU simpler ? 
I am unsure, because in the end the weight of CPU plus interrupt controller will be more or less the same (maybe the address decoder for the CSRs can be simpler, @@ -97,7 +97,11 @@ List of questions: memory-mapped CSRs, especially in a multicore context where each core can access other core's CSRs, but is is still **very unclear** to me). In particular, which one is relevant for us ? Do we need to implement one of them or can we do - something simpler ? + something simpler ? I think that PLIC and (A)CLINT are for multi-core systems + (how a core can access another core's CSR, that are memory-mapped). So I think + that if one of these is relevant for us, it is probably CLIC. But I also think + that we probably need none of these, if we implement `mtime` / `mtimecmp` + in-core (unless FreeRTOS supposes they are memory-mapped). - [PLIC](https://github.com/riscv/riscv-plic-spec/blob/master/riscv-plic.adoc) - [(A)CLINT](https://github.com/riscv/riscv-aclint/blob/main/riscv-aclint.adoc) - [CLIC](https://github.com/riscv/riscv-fast-interrupt/blob/master/clic.adoc) From c971acff5859cfc9ba0f1b726d8c76daeebae25b Mon Sep 17 00:00:00 2001 From: Bruno Date: Sun, 14 Jan 2024 17:07:13 +0100 Subject: [PATCH 14/23] wrote a paragraph about Matthia's interrupts management in FemtoRV --- .../FROM_BLINKER_TO_RISCV/INTERRUPTS.md | 84 ++++++++++++++++++- 1 file changed, 82 insertions(+), 2 deletions(-) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md index d2d9688a..3499eb9c 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md @@ -3,6 +3,7 @@ This is WIP, for now just a scratchpad with notes. 
Goals: +====== - create a step-by-step gentle introduction, morphing the processor obtained at the end of Episode I into something that can run FreeRTOS @@ -66,6 +67,7 @@ could be something like (first draft): particular about simulation / verilator etc...) List of questions: +================== - what is the minimal list of CSRs and instructions needed to run FreeRTOS ? First guess: - mepc: saved PC @@ -106,10 +108,88 @@ List of questions: - [(A)CLINT](https://github.com/riscv/riscv-aclint/blob/main/riscv-aclint.adoc) - [CLIC](https://github.com/riscv/riscv-fast-interrupt/blob/master/clic.adoc) -Links: +Interrupts, Exception, Traps +============================ + +Definitions +----------- + +- Exception: unusual condition of run-time associated with an instruction +- Trap: synchronous transfer to a trap handler caused by exceptional condition +- Interrupt: external event that occurs asynchronously +(if I understand well, a trap is what you return from using Xret. An exception is + what triggers a trap from the current instruction, and an interrupt is what triggers + a trap asynchronously, from the timer, or from a special wire). + + +Interrupts in existing FemtoRV cores +------------------------------------ + +Matthias has developed three FemtoRVs with interrupt support: +- intermissum (RV32-IM) +- gracilis (RV32-IMC) +- individua (RV32-IMAC) + +The interrupt logic is common to the three of them. 
They +have an additional wire `interrupt_request` that triggers an interrupt + +They implement the following CSRs: +- `mepc`: saved program counter +- `mtvec`: address of the interrupt handler +- `mstatus` bit x: interrupt enable +- `mcause` bit x: interrupt cause (and lock: already in interrupt handler) +- there is also an `interrupt_request_sticky` flipflop + +Besides writing/reading the new CSRs (easy), we need to make three modifications in +our core: +- 1 how the `interrupt_request` discusses with the rest of the chip +- 2 how (and when) do we jump to a trap handler +- 3 how do we return from a trap handler (that is, what `mret` does) + +**1: how `interrupt_request` discusses with `interrupt_sticky`:** + +`interrupt_request` only talks to `interrupt_sticky`, and the rest of the chip only sees `interrupt_sticky`. +- if `interrupt_request` goes high, `interrupt_sticky` goes high +- if `interrupt_sticky` is high, it stays high until the interrupt has been processed (that is, until we go through + the `execute` state that does what should be done with the interrupt). +**2: how (and when) do we jump to a trap handler ?** + +we just need to do three things: +- jump to the trap handler, that is, set `PC` to `mtvec` +- save return address, that is, set `mepc` to `PC+4` (or `PC+2` if it is a RV32C instruction) +- indicate that we are in a trap handler, by setting bit 31 of `mcause` (indicates that we are in an interrupt) + +It is done in the `EXECUTE` stage under three conditions: +- there is an interrupt pending (`interrupt_request_sticky` is asserted) and +- interrupts are enabled (`MIE`, that is `mstatus`[3] is set) and +- we are not in an interrupt handler already (`mcause`[31]) is not asserted + +**3: how do we return from a trap handler ?** +- reset `mcause[31]` to 0 +- jump to the return address in `mepc` + +It is done in the `EXECUTE` state. 
`mepc` is selected by the `PC_next` mux when the current instruction is `mret` + +**Another view of what happens when an interrupt is triggered** +- 1 `interrupt_request` is asserted by the external interrupt source +- 2 `interrupt_sticky` goes high (and remains high until we are in `EXECUTE`) +- 3 `EXECUTE` sets `mcause[31]`, saves the return address to `mepc` and jumps to the trap handler. + `interrupt_sticky` goes low +- 4 the instructions in the trap handler are executed until current instruction is `mret` +- 5 `EXECUTE` processes `mret` (resets `mcause[31]` and jumps to `mepc`) + +Question: in the Risc-V norm, `mstatus` has a `mip` bit (machine interrupt pending). Is it +different or is it the same thing as our `interrupt_sticky` ? + +Links: +====== - @cnlohr's [minirv32](https://github.com/cnlohr/mini-rv32ima) -- Linux-capable @ultraembedded's [exact-step](https://github.com/ultraembedded/exactstep/blob/master/cpu-rv32/rv32.cpp) +- Linux-capable @ultraembedded's simulator [exact-step](https://github.com/ultraembedded/exactstep/blob/master/cpu-rv32/rv32.cpp) - @regymm [quasi-soc](https://github.com/regymm/quasiSoC) + +- @MrBossman [kisc-v](https://github.com/Mr-Bossman/KISC-V) + +- @splinedrive [Kian risc-V](https://github.com/splinedrive/kianRiscV) From bc4f8451795a0434259ae2c83ab26deabd841dbd Mon Sep 17 00:00:00 2001 From: Bruno Date: Sun, 14 Jan 2024 22:21:29 +0100 Subject: [PATCH 15/23] updated notes on interrupts --- .../FROM_BLINKER_TO_RISCV/INTERRUPTS.md | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md index 3499eb9c..10bf6296 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md @@ -14,7 +14,7 @@ Goals: material) -I think that @Mecrisp's `gracilis` (extended with the memory-mapped register plus the interrupt source) has everything 
needed. +I think that @Mecrisp's `individua` (extended with the memory-mapped register plus the interrupt source) has everything needed. - The first thing to do is of course to get the thing running. How to add a mapped register is explained @@ -182,10 +182,51 @@ It is done in the `EXECUTE` state. `mepc` is selected by the `PC_next` mux when Question: in the Risc-V norm, `mstatus` has a `mip` bit (machine interrupt pending). Is it different or is it the same thing as our `interrupt_sticky` ? +What I think we need for FreeRTOS +================================= + +- We probably need the 'A' instructions (so we can start from FemtoRV-individua) +- We probably need the `ECALL` instruction and the associated bits in `mcause` +- We need `mtime`,`mtimeh` (we can reuse `mcycles`, `mcyclesh`) +- We need `mtimecmp`,`mtimehcmp` and the associated bits in `mcause` +- We probably need an external interrupt source for the UART (we can use the existing `interrupt_request`) +- We may need the `mscratch` CSR +- We probably need `mtval` (machine bad address or instruction) + +What I think we need for Linux-noMMU +==================================== + +Let us take a look at @cnlohr's miniRV32. It has: +- `mstatus`, `mscratch`, `mtvec`, `mie`, `mip`, `mepc`, `mtval`, `mcause` +- `cycle[l,h]`, `timer[l,h]`, `timermatch[l,h]` (Q: can't we use cycle as timer ? Q: is timer written ?) +- `extraflags`: privilege (2 bits), WFI (1 bit), Load/Store reservation LSBs (what's that ?) + +Remarks, questions: +- It seems we only need the 'm' CSR bank, cool. +- @cnlohr's code is short and easy to read. +- what is load/store reservation ? +- take a closer look at extraflags +- mini-rv32ima.c contains the "SOC" +- what is the minimum required amount of RAM ? + +What is the bare minimal amount of hw to be able to run Linux-noMMU ? 
+===================================================================== + +@MrBossman's [kisc-v](https://github.com/Mr-Bossman/KISC-V) has an interesting super minimalistic implementation, +that emulates the privileged ISA in trap handlers. It has a trap mechanism that exchanges PC with a pointer stored +at a given address whenever an unknown instruction is encountered. + +But there are several things I need to understand: +- how does it make the difference between traps and interrupts ? +- how does it masks interrupts ? +- how does it handle pending interrupts ? + Links: ====== - @cnlohr's [minirv32](https://github.com/cnlohr/mini-rv32ima) +- Stack Overflow questions referenced in minirv32 [here](https://stackoverflow.com/questions/61913210/risc-v-interrupt-handling-flow/61916199#61916199) + - Linux-capable @ultraembedded's simulator [exact-step](https://github.com/ultraembedded/exactstep/blob/master/cpu-rv32/rv32.cpp) - @regymm [quasi-soc](https://github.com/regymm/quasiSoC) From 5c76b38d3678669660731158315adeb610b7d447 Mon Sep 17 00:00:00 2001 From: Bruno Date: Mon, 15 Jan 2024 07:42:14 +0100 Subject: [PATCH 16/23] notes on interrupts, WIP --- FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md index 10bf6296..e1cf8e3b 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md @@ -218,8 +218,16 @@ at a given address whenever an unknown instruction is encountered. But there are several things I need to understand: - how does it make the difference between traps and interrupts ? + - all regs are copied at fixed address by trap handler + - trap handler changes the stack pointer to `_sstack` (system stack ?) 
+ - trap handler calls `entry` + - trap handler restores regs + - trap handler jumps to address `back`, that has instruction with opcode `1` (unsupported, so it swaps PC and saved PC) + - `cause` is deduced from `intc` array at fixed address. Where is `intc` written ? In HDL ? Yes, it seems that it is + memory-mapped registers in the interrupt controller. - how does it masks interrupts ? - how does it handle pending interrupts ? +- what happens if an interrupts occurs when in trap handler ? is it noted as pending ? Links: ====== From 0451c03077a8ca04fa374b80353cecd9559b1fca Mon Sep 17 00:00:00 2001 From: Bruno Date: Mon, 15 Jan 2024 09:41:12 +0100 Subject: [PATCH 17/23] more notes on interrupts --- .../FROM_BLINKER_TO_RISCV/INTERRUPTS.md | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md index e1cf8e3b..f73d4169 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md @@ -229,6 +229,45 @@ But there are several things I need to understand: - how does it handle pending interrupts ? - what happens if an interrupts occurs when in trap handler ? is it noted as pending ? +Draft for the new tutorial +========================== + +The first concept to introduce is sw implementation of new instructions through +trap handler, because we are going to reuse it for memory-mapped CSRs accessed +through SYSTEM instructions. + +- 1. Intro, basic notions (traps, interrupts, exceptions) +- 2. Implementing instructions in software + - the bare minimum: `mtvec`, `mepc`, `mret` (maybe hardwired `mtvec`) + - software implementation of RV32M + - software implementation of RV32F (in RV32I and in RV32M: nested traps) + - mixed software/hardware implementation of RV32F (all variants of FMA and + comparisons in hardware, and the rest (FDIV,FSQRT...) in software). +- 3. 
Interrupts + - now we need `mcause` so that the trap handler can discriminate + - external interrupts source, an `interrupt_request` wire + - a timer interrupt, basic PLIC implementation, CSRs projected in memory + space, and access to the CSRs in trap handler +- 4. Run FreeRTOS (maybe in a separate tutorial) + - The memory environment: FreeRTOS memory mapping, memory-mapped CSRs + - The instructions to be supported, and CSR access in trap handlers + +- 5. Linux (maybe in a separate tutorial) + - The memory environment: Linux memory mapping, memory-mapped CSRs + - The instructions to be supported, and CSR access in trap handlers + +The smallest (NoMMU)-Linux-capable Core +======================================= + +In HW: a non-standard trap-handler mechanism (Mr Bossman's kisc-v) that +works as follows: +- `mtvec` is a hardwired constant address +- There is a memory-mapped `mepc` CSR +- Each time an unrecognized instruction reaches `EXECUTE`, jump to `mepc` and + replace `mepc` with `PC+4` +- There is also PLIC-like interrupt logic, with memory-mapped + `mip`, `mie`, `mstatus`, `mcause` + Links: ====== - @cnlohr's [minirv32](https://github.com/cnlohr/mini-rv32ima) From 5bb5c5e5490d31366e41e16d8b5822a62d70254a Mon Sep 17 00:00:00 2001 From: Bruno Date: Fri, 19 Jan 2024 17:39:17 +0100 Subject: [PATCH 18/23] added link to doc on timers interrupts by Daniel Magum' --- FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md index f73d4169..de079d1d 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md @@ -270,6 +270,9 @@ works as follows: Links: ====== + +- Daniel Mangum's [detailed explanations on timer interrupts](https://danielmangum.com/posts/risc-v-bytes-timer-interrupts/) + - @cnlohr's [minirv32](https://github.com/cnlohr/mini-rv32ima) 
- Stack Overflow questions referenced in minirv32 [here](https://stackoverflow.com/questions/61913210/risc-v-interrupt-handling-flow/61916199#61916199) From aba73360cd12ca0f6d20b2278b30c0f86bc1fbcc Mon Sep 17 00:00:00 2001 From: Bruno Date: Tue, 7 May 2024 10:14:11 +0200 Subject: [PATCH 19/23] Added links --- .../FROM_BLINKER_TO_RISCV/FIRMWARE/GL_tty.h | 460 ++++++++++++++++++ .../FIRMWARE/humanshader.c | 113 +++++ .../FROM_BLINKER_TO_RISCV/FIRMWARE/pi.c | 5 +- .../FROM_BLINKER_TO_RISCV/INTERRUPTS.md | 1 + README.md | 14 + 5 files changed, 591 insertions(+), 2 deletions(-) create mode 100644 FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/GL_tty.h create mode 100644 FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/humanshader.c diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/GL_tty.h b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/GL_tty.h new file mode 100644 index 00000000..a61932c0 --- /dev/null +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/GL_tty.h @@ -0,0 +1,460 @@ +/** + * ansi_graphics.h + * A couple of function to display graphics in the terminal, + * using ansi sequences. + * Bruno Levy, Jan 2024 + */ + +#include +#include +#include +#include + +#ifndef GL_FPS +#define GL_FPS 30 +#endif + +#if defined(__linux__) || defined(_WIN32) || defined(__APPLE__) +#define BIGCPU // we are compiling for a real machine +#else +#define TINYCPU // we are compiling for a softwore +#endif + +#ifdef __linux__ +#include // for usleep() +#endif + +// You can define GL_width and GL_height before +// #including ansi_graphics.h in case the plain +// old 80x25 pixels does not suffice. 
+ +#ifndef GL_width +#define GL_width 80 +#endif + +#ifndef GL_height +#define GL_height 25 +#endif + +/** + * \brief Sets the current graphics position + * \param[in] x typically in 0,79 + * \param[in] y typically in 0,24 + */ +static inline void GL_gotoxy(int x, int y) { + printf("\033[%d;%dH",y,x); +} + +/** + * \brief Draws a pixel at the current cursor position and advances the cursor + * \param[in] R , G , B the RGB color of the pixel, in [0..255] + * \details Typically used by programs that draw all pixels sequentially, + * like a raytracer. After each line, one can either printf("\n") or + * call GL_gotoxy(). If you want to draw individual pixels in an + * arbitrary order, use GL_setpixelRGB(x,y,R,G,B) + */ +static inline void GL_setpixelRGBhere(uint8_t R, uint8_t G, uint8_t B) { + // set background color, print space + printf("\033[48;2;%d;%d;%dm ",(int)R,(int)G,(int)B); +} + + +/** + * \brief Draws two "pixels" at the current + * cursor position and advances the current cursor + * position. + * \details Characters are roughly twice as high as wide. + * To generate square pixels, this function draws two pixels in + * the same character, using the special lower-half white / upper-half + * black character, and setting the background and foreground colors. 
+ */ +static inline void GL_set2pixelsRGBhere( + uint8_t r1, uint8_t g1, uint8_t b1, + uint8_t r2, uint8_t g2, uint8_t b2 +) { + if((r2 == r1) && (g2 == g1) && (b2 == b1)) { + GL_setpixelRGBhere(r1,g1,b1); + } else { + printf("\033[48;2;%d;%d;%dm",(int)r1,(int)g1,(int)b1); + printf("\033[38;2;%d;%d;%dm",(int)r2,(int)g2,(int)b2); + // https://www.w3.org/TR/xml-entity-names/025.html + // https://onlineunicodetools.com/convert-unicode-to-utf8 + // https://copypastecharacter.com/ + printf("\xE2\x96\x83"); + } +} + +#define GL_RGB(R,G,B) #R ";" #G ";" #B + +static inline void GL_setpixelIhere( + const char** cmap, int c +) { + // set background color, print space + printf("\033[48;2;%sm ",cmap[c]); +} + +static inline void GL_set2pixelsIhere( + const char** cmap, int c1, int c2 +) { + if(c1 == c2) { + GL_setpixelIhere(cmap, c1); + } else { + printf("\033[48;2;%sm",cmap[c1]); + printf("\033[38;2;%sm",cmap[c2]); + // https://www.w3.org/TR/xml-entity-names/025.html + // https://onlineunicodetools.com/convert-unicode-to-utf8 + // https://copypastecharacter.com/ + printf("\xE2\x96\x83"); + } +} + +/** + * \brief Moves the cursor position to the next line. + * \details Background and foreground colors are set to black. 
+ */ +static inline void GL_newline() { + printf("\033[38;2;0;0;0m"); + printf("\033[48;2;0;0;0m\n"); +} + +/** + * \brief Sets the color of a pixel + * \param[in] x typically in 0,79 + * \param[in] y typically in 0,24 + * \param[in] R , G , B the RGB color of the pixel, in [0..255] + */ +static inline void GL_setpixelRGB( + int x, int y, uint8_t R, uint8_t G, uint8_t B +) { + GL_gotoxy(x,y); + GL_setpixelRGBhere(R,G,B); +} + +/** + * \brief restore default foreground and background colors + */ +static inline void GL_restore_default_colors() { + printf( + "\033[48;5;16m" // set background color black + "\033[38;5;15m" // set foreground color white + ); +} + +/** + * \brief Call this function each time graphics should be cleared + */ +static inline void GL_clear() { + GL_restore_default_colors(); + printf("\033[2J"); // clear screen +} + +/** + * \brief Moves current drawing position to top-left corner + * \see GL_setpixelRGBhere() and GL_set2pixelsRGBhere() + */ +static inline void GL_home() { + printf("\033[H"); +} + +/** + * \brief Call this function before starting drawing graphics + * or each time graphics should be cleared + */ +static inline void GL_init() { + printf("\033[?25l"); // hide cursor + GL_home(); + GL_clear(); +} + + +/** + * \brief Call this function at the end of the program + */ +static inline void GL_terminate() { + GL_restore_default_colors(); + GL_gotoxy(0,GL_height); + printf("\033[?25h"); // show cursor +} + +/** + * \brief Flushes pending graphic operations and waits a bit + */ +static inline void GL_swapbuffers() { + // only flush if we are on a big machine, with true stdio support + // otherwise does nothing (because our small MCU io lib is not buffered) +#ifdef BIGCPU + fflush(stdout); +#endif +#ifdef __linux__ + usleep(1000000/GL_FPS); +#endif +} + +typedef void (*GL_pixelfunc_RGB)(int x, int y, uint8_t* r, uint8_t* g, uint8_t* b); +typedef void (*GL_pixelfunc_RGBf)(int x, int y, float* r, float* g, float* b); + +/** + * \brief Draws 
an image by calling a user-specified function for each pixel. + * \param[in] width , height dimension of the image in square pixels + * \param[in] do_pixel the user function to be called for each pixel + * (a "shader"), that determines the (integer) components r,g,b of + * the pixel's color. + * \details Uses half-charater pixels. + */ +static inline void GL_scan_RGB( + int width, int height, GL_pixelfunc_RGB do_pixel +) { + uint8_t r1, g1, b1; + uint8_t r2, g2, b2; + GL_home(); + for (int j = 0; j 1.0f) ? 1.0f : f; + return (uint8_t)(255.0f * f); +} + +/** + * \brief Draws an image by calling a user-specified function for each pixel. + * \param[in] width , height dimension of the image in square pixels + * \param[in] do_pixel the user function to be called for each pixel + * (a "shader"), that determines the (floating-point) components + * fr,fg,fb of the pixel's color. + * \details Uses half-charater pixels. + */ +static inline void GL_scan_RGBf( + int width, int height, GL_pixelfunc_RGBf do_pixel +) { + float fr1, fg1, fb1; + float fr2, fg2, fb2; + uint8_t r1, g1, b1; + uint8_t r2, g2, b2; + GL_home(); + for (int j = 0; j XMAX)<<1) | (((y) < YMIN)<<2) | (((y) > YMAX)<<3) + +/***************************************************************/ + +static inline void GL_line( + int x1, int y1, int x2, int y2, int R, int G, int B +) { + int x,y,dx,dy,sx,sy,tmp; + + /* Cohen-Sutherland line clipping. */ + int code1 = code(x1,y1); + int code2 = code(x2,y2); + int codeout; + + for(;;) { + /* Both points inside. */ + if(code1 == 0 && code2 == 0) { + break; + } + + /* No point inside. */ + if(code1 & code2) { + return; + } + + /* One of the points is outside. */ + codeout = code1 ? code1 : code2; + + /* Compute intersection. 
*/ + if (codeout & TOP) { + x = x1 + (x2 - x1) * (YMAX - y1) / (y2 - y1); + y = YMAX; + } else if (codeout & BOTTOM) { + x = x1 + (x2 - x1) * (YMIN - y1) / (y2 - y1); + y = YMIN; + } else if (codeout & RIGHT) { + y = y1 + (y2 - y1) * (XMAX - x1) / (x2 - x1); + x = XMAX; + } else if (codeout & LEFT) { + y = y1 + (y2 - y1) * (XMIN - x1) / (x2 - x1); + x = XMIN; + } + + /* Replace outside point with intersection. */ + if (codeout == code1) { + x1 = x; + y1 = y; + code1 = code(x1,y1); + } else { + x2 = x; + y2 = y; + code2 = code(x2,y2); + } + } + + // Swap both extremities to ensure x increases + if(x2 < x1) { + tmp = x2; + x2 = x1; + x1 = tmp; + tmp = y2; + y2 = y1; + y1 = tmp; + } + + // Bresenham line drawing. + dy = y2 - y1; + sy = 1; + if(dy < 0) { + sy = -1; + dy = -dy; + } + + dx = x2 - x1; + + x = x1; + y = y1; + + if(dy > dx) { + int ex = (dx << 1) - dy; + for(int u=0; u= 0) { + x++; + ex -= dy << 1; + GL_setpixelRGB(x,y,R,G,B); + } + while(ex >= 0) { + x++; + ex -= dy << 1; + putchar(' '); + } + ex += dx << 1; + } + } else { + int ey = (dy << 1) - dx; + for(int u=0; u= 0) { + y += sy; + ey -= dx << 1; + GL_setpixelRGB(x,y,R,G,B); + } + ey += dy << 1; + } + } +} + + +/***************************************************************/ + +#ifdef GL_USE_TURTLE + +#include "sintab.h" // Ugly !!! 
+ +typedef struct { + int x; // in [0..79] + int y; // in [0..24] + int angle; // in degrees + int R,G,B; // pen color + int pendown; // draw if non-zero +} Turtle; + +static inline void Turtle_init(Turtle* T) { + T->x = GL_width/2; + T->y = GL_height/2; + T->angle = -90; + T->pendown = 1; + T->R = 255; + T->G = 255; + T->B = 255; +} + +static inline void Turtle_pen_up(Turtle* T) { + T->pendown = 0; +} + +static inline void Turtle_pen_down(Turtle* T) { + T->pendown = 1; +} + +static inline void Turtle_pen_color(Turtle* T, int R, int G, int B) { + T->R = R; + T->G = G; + T->B = B; +} + +static inline void Turtle_forward(Turtle* T, int distance) { + int last_x = T->x; + int last_y = T->y; + int a = T->angle; + while(a < 0) { + a += 360; + } + while(a > 360) { + a -= 360; + } + T->x += (costab[a] * distance) / 256; + T->y += (sintab[a] * distance) / 256; + if(T->pendown) { + GL_line(last_x, last_y, T->x, T->y, T->R, T->G, T->B); + } +} + +static inline void Turtle_backward(Turtle* T, int distance) { + Turtle_forward(T,-distance); +} + +static inline void Turtle_turn_right(Turtle* T, int delta_angle) { + T->angle += delta_angle; +} + +static inline void Turtle_turn_left(Turtle* T, int delta_angle) { + Turtle_turn_right(T, -delta_angle); +} + +#endif diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/humanshader.c b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/humanshader.c new file mode 100644 index 00000000..72a419ce --- /dev/null +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/humanshader.c @@ -0,0 +1,113 @@ +// C version of humanshader +// See https://humanshader.com/ +// (using a computer is clearly not as fun, but it is interesting to have +// a small not too computationally expensive raytracing program that +// can run on small softcores for PGAs). 
+// Using the 16-bits version with no divide from here: https://www.shadertoy.com/view/XflXDs + +#define GL_width 71 +#define GL_height 40 +#include "GL_tty.h" + +void human_shader( + int x, int y, uint8_t* r_out, uint8_t* g_out, uint8_t* b_out +) { + int R, B; + + //------------------------- + // Section A (2 MUL, 3 ADD) + //------------------------- + int u = x-36; + int v = 18-y; + int u2 = u*u; + int v2 = v*v; + int h = u2 + v2; + //------------------------- + + if( h < 200 ) + { + //------------------------------------- + // Section B, Sphere (4/7 MUL, 5/9 ADD) + //------------------------------------- + R = 420; + B = 520; + + int t = 5200 + (h<<3); + int p = (t*u)>>7; + int q = (t*v)>>7; + + // bounce light + int w = 18 + (((p*5-q*13))>>9); + if( w>0 ) R += w*w; + + // sky light / ambient occlusion + int o = q + 900; + R = (R*o)>>12; + B = (B*o)>>12; + + // sun/key light + if( p > -q ) + { + int w = (p+q)>>3; + R += w; + B += w; + } + //------------------------- + } + else if( v<0 ) + { + //------------------------------------- + // Section C, Ground (5/9 MUL, 6/9 ADD) + //------------------------------------- + R = 150 + (v<<1); + B = 50; + + int p = h + (v2<<3); + int c = 240*(-v) - p; + + // sky light / ambient occlusion + if( c>1200 ) + { + int o = (25*c)>>3; + o = (c*(7840-o)>>9) - 8560; + R = (R*o)>>10; + B = (B*o)>>10; + } + + // sun/key light with soft shadow + int r = c + u*v; + int d = 3200 - h - (r<<1); + if( d>0 ) R += d; + //------------------------- + } + else + { + //------------------------------ + // Section D, Sky (1 MUL, 2 ADD) + //------------------------------ + int c = x + (y<<2); + R = 132 + c; + B = 192 + c; + //------------------------- + } + + //------------------------- + // Section E (3 MUL, 1 ADD) + //------------------------- + if(R > 255) R = 255; + if(B > 255) B = 255; + + int G = (R*11 + 5*B)>>4; + //------------------------- + + *r_out = (uint8_t)R; + *g_out = (uint8_t)G; + *b_out = (uint8_t)B; +} + +int main() { + 
GL_init(); + GL_scan_RGB(GL_width, GL_height, human_shader); + GL_terminate(); + return 0; +} diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/pi.c b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/pi.c index 80e19de1..dd17a91a 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/pi.c +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/FIRMWARE/pi.c @@ -88,8 +88,9 @@ int is_prime(int n) if ((n % 2) == 0) return 0; - r = (int) (sqrt(n)); - for (i = 3; i <= r; i += 2) + //r = (int) (sqrt(n)); + //for (i = 3; i <= r; i += 2) + for (i = 3; i*i <= n; i += 2) if ((n % i) == 0) return 0; return 1; diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md index de079d1d..c9cf7cff 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/INTERRUPTS.md @@ -284,3 +284,4 @@ Links: - @MrBossman [kisc-v](https://github.com/Mr-Bossman/KISC-V) - @splinedrive [Kian risc-V](https://github.com/splinedrive/kianRiscV) + diff --git a/README.md b/README.md index 4549516f..e953abe6 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,20 @@ functional RISC-V core that can compute and display graphics. In [Episode II](https://github.com/BrunoLevy/learn-fpga/blob/master/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/PIPELINE.md), you will learn how to design a pipelined processor. +Links - Other FPGA resources +---------------------------- +- [TinyPrograms](https://github.com/BrunoLevy/TinyPrograms) Tiny yet interesting C programs to play with your softcore +- [LiteX](https://github.com/enjoy-digital/litex) Framework in Amaranth (Python-based HDL) to build SOCs +- [Silice](https://github.com/sylefeb/Silice) A new HDL by my friend Sylvain Lefebvre +- [PipelineC](https://github.com/JulianKemmerer/PipelineC) Transform a C program into a pipelined specialized core ! 
+- [ultraembedded](https://github.com/ultraembedded/) Amazing resources, [FatIOLib](https://github.com/ultraembedded/fat_io_lib),[ExactStep](https://github.com/ultraembedded/exactstep)... +- [pivoRV](https://github.com/YosysHQ/picorv32) by Claire Wolf, my principal source of inspiration +- [VexRiscV](https://github.com/SpinalHDL/VexRiscv) and [NaxRiscV](https://github.com/SpinalHDL/NaxRiscv), performant and configurable pipelined and OoO cores, by Charles Papon, in SpinalHDL +- [DarkRiscV](https://github.com/darklife/darkriscv) a simple pipelined core (written in one night according to the legend) +- [kianRiscV](https://github.com/splinedrive/kianRiscV) a simple yet complete Linux-capable core + soc +- [Will Green's project F](https://github.com/projf/projf-explore) tutorials with nice graphics effects +- [fpga4fun](https://www.fpga4fun.com/) learned there how to create VGA graphics + Basic: more basic things I wrote during May 2020 - June 2020 ------------------------------------------------------------ Files are [here](https://github.com/BrunoLevy/learn-fpga/tree/master/Basic). 
From b77d6798b75d6e6bd7f0ef5fbb841bfb98eb2822 Mon Sep 17 00:00:00 2001 From: Bruno Date: Tue, 7 May 2024 10:55:01 +0200 Subject: [PATCH 20/23] Added more links --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e953abe6..1d8627ae 100644 --- a/README.md +++ b/README.md @@ -44,14 +44,17 @@ Links - Other FPGA resources - [TinyPrograms](https://github.com/BrunoLevy/TinyPrograms) Tiny yet interesting C programs to play with your softcore - [LiteX](https://github.com/enjoy-digital/litex) Framework in Amaranth (Python-based HDL) to build SOCs - [Silice](https://github.com/sylefeb/Silice) A new HDL by my friend Sylvain Lefebvre +- [FuseSOC](https://github.com/olofk/fusesoc) and [Edalize](https://github.com/olofk/edalize), package manager and abstraction of FPGA tools - [PipelineC](https://github.com/JulianKemmerer/PipelineC) Transform a C program into a pipelined specialized core ! - [ultraembedded](https://github.com/ultraembedded/) Amazing resources, [FatIOLib](https://github.com/ultraembedded/fat_io_lib),[ExactStep](https://github.com/ultraembedded/exactstep)... 
-- [pivoRV](https://github.com/YosysHQ/picorv32) by Claire Wolf, my principal source of inspiration +- [picoRV](https://github.com/YosysHQ/picorv32) by Claire Wolf, my principal source of inspiration - [VexRiscV](https://github.com/SpinalHDL/VexRiscv) and [NaxRiscV](https://github.com/SpinalHDL/NaxRiscv), performant and configurable pipelined and OoO cores, by Charles Papon, in SpinalHDL +- [SERV](https://github.com/olofk/serv) the tiniest RiscV core, with a bit-serial ALU - [DarkRiscV](https://github.com/darklife/darkriscv) a simple pipelined core (written in one night according to the legend) - [kianRiscV](https://github.com/splinedrive/kianRiscV) a simple yet complete Linux-capable core + soc - [Will Green's project F](https://github.com/projf/projf-explore) tutorials with nice graphics effects - [fpga4fun](https://www.fpga4fun.com/) learned there how to create VGA graphics +- [CoreScore](https://corescore.store/) how many cores can you fit on a FPGA ? Basic: more basic things I wrote during May 2020 - June 2020 ------------------------------------------------------------ From 61b9b1c18d3962f4c2c2d0b57356c531f69b9424 Mon Sep 17 00:00:00 2001 From: Bruno Date: Sat, 11 May 2024 12:09:07 +0200 Subject: [PATCH 21/23] Added link to tinysys --- FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md | 2 +- README.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md index a18f26e3..d3f80b30 100644 --- a/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md +++ b/FemtoRV/TUTORIALS/FROM_BLINKER_TO_RISCV/README.md @@ -719,7 +719,7 @@ The register bank is implemented as follows: ``` Let us take a closer look at what we need to to to execute an instruction. -Condider for instance a stream of R-type instructions. For each instruction, +Consider for instance a stream of R-type instructions. 
For each instruction, we need to do the following four things: - fetch the instruction: `instr <= MEM[PC]` diff --git a/README.md b/README.md index 1d8627ae..9e0d52db 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ Links - Other FPGA resources - [SERV](https://github.com/olofk/serv) the tiniest RiscV core, with a bit-serial ALU - [DarkRiscV](https://github.com/darklife/darkriscv) a simple pipelined core (written in one night according to the legend) - [kianRiscV](https://github.com/splinedrive/kianRiscV) a simple yet complete Linux-capable core + soc +- [TinySys](https://github.com/ecilasun/tinysys/wiki) not that tiny SOC and OS - [Will Green's project F](https://github.com/projf/projf-explore) tutorials with nice graphics effects - [fpga4fun](https://www.fpga4fun.com/) learned there how to create VGA graphics - [CoreScore](https://corescore.store/) how many cores can you fit on a FPGA ? From 23f0d90ab639e73300d116c949c1839fa613056c Mon Sep 17 00:00:00 2001 From: Bruno Date: Sun, 23 Feb 2025 19:36:07 +0100 Subject: [PATCH 22/23] Added ref to transputer T9000 FPU --- FemtoRV/FIRMWARE/EXAMPLES/hello_LED.c | 7 +------ FemtoRV/FIRMWARE/EXAMPLES/pi.c | 4 ++-- FemtoRV/TUTORIALS/FPU.md | 3 +++ 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/FemtoRV/FIRMWARE/EXAMPLES/hello_LED.c b/FemtoRV/FIRMWARE/EXAMPLES/hello_LED.c index d4a6e5d1..377e1c66 100644 --- a/FemtoRV/FIRMWARE/EXAMPLES/hello_LED.c +++ b/FemtoRV/FIRMWARE/EXAMPLES/hello_LED.c @@ -6,12 +6,7 @@ int main() { MAX7219_tty_init(); // redirect printf() to led matrix scroller for(;;) { -// printf("Hello, RISC-V world \001 \002 \001 \002 "); -// printf("Hello, TelecomNancy ! \001 \002 Best school ! \001 \002 "); -// printf("Hello FemtoRV friend !!! \001 \002 \001 \002 "); -// printf("Hello, Hackaday \001 \002 Greetings from FemtoRV !!! "); -// printf("Hello, RISC-V world \001 \002 \001 \002 "); - printf("Hello, caf\202 LoOPS ! 
\001 \002 \001 \002 "); + printf("Hello, RISC-V world \001 \002 \001 \002 "); } return 0; } diff --git a/FemtoRV/FIRMWARE/EXAMPLES/pi.c b/FemtoRV/FIRMWARE/EXAMPLES/pi.c index ea3e63b3..95e75abb 100644 --- a/FemtoRV/FIRMWARE/EXAMPLES/pi.c +++ b/FemtoRV/FIRMWARE/EXAMPLES/pi.c @@ -176,8 +176,8 @@ int digits(int n) { int main() { MAX7219_tty_init(); // Uncomment to display on led matrix. -// femtosoc_tty_init(); -// GL_set_font(&Font3x5); + femtosoc_tty_init(); + GL_set_font(&Font3x5); // GL_set_font(&Font8x16); printf("pi = 3."); for(int n=1; ;n+=9) { diff --git a/FemtoRV/TUTORIALS/FPU.md b/FemtoRV/TUTORIALS/FPU.md index ed016724..da026175 100644 --- a/FemtoRV/TUTORIALS/FPU.md +++ b/FemtoRV/TUTORIALS/FPU.md @@ -656,3 +656,6 @@ References - [A FPU written in system verilog](https://github.com/taneroksuz/riscv-fpu) - [Berkeley SoftFloat and HardFloat](http://www.jhauser.us/arithmetic/) + +- [T9000 transputer FPU design](https://transputer.net/fbooks/t9000/t9kfpdsn.pdf) +- [Transputer](https://thechipletter.substack.com/p/inmos-and-the-transputer-instruction) From e468d3fee7265f81156400b3f7b982645957c7d5 Mon Sep 17 00:00:00 2001 From: Bruno Date: Tue, 25 Feb 2025 10:54:00 +0100 Subject: [PATCH 23/23] Notes on Morph project --- FemtoRV/TUTORIALS/Morph.md | 51 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 FemtoRV/TUTORIALS/Morph.md diff --git a/FemtoRV/TUTORIALS/Morph.md b/FemtoRV/TUTORIALS/Morph.md new file mode 100644 index 00000000..5eb7730e --- /dev/null +++ b/FemtoRV/TUTORIALS/Morph.md @@ -0,0 +1,51 @@ +FemtoRV-Morph - Notes +===================== + +Mission statement: create a Risc-V processor with an _interesting_ +capabilities/complexity/performance ratio. The main idea is to start from +an RV32ICZicsr core like Gracilis, add the instructions that have an +interesting area/performance ratio (e.g., MUL variants and a subset of RV32F) +and implement the rest in software traps. 
There would be two versions, +Caterpillar and QuickSilver (abbreviated Hg). + +FemtoRVMorph-Caterpillar +======================== + +- RV32ICZicsr (Gracilis base) + +- RV32M: MUL, MULH, MULHSU, MULHU (DIV, DIVU, REM, REMU in sw trap) + +- Implement a subset of RV32F (separate fp registers) or Zfinx (shared registers) + Zfinx may be the way to go (much simpler datapath, smaller nb instructions), + and Zfinx could be used to emulate RV32F in trap ! + + It is mostly a FMA (Fused-Multiply-Add) unit, used to implement: + - Sum and product: FADD, FSUB, FMUL, FMADD, FMSUB, FNMADD, FNMSUB + - Comparison: FEQ, FLT, FLE + - Load/Store (RV32F): FLW, FSW + - integer reg <-> fp reg (RV32F): FMVXW, FMVWX + - All the rest in sw trap + +- Questions: + - RV32F or Zfinx ? + - single-precision or double-precision ? + +FemtoRVMorph-Hg (QuickSilver) +============================= + +- Pipelined with branch prediction +- I$, D$ caches +- MCT (Minimal Cost Trap) mechanism: cooperation between illegal instruction + trap mechanism and address prediction logic, fast context switch logic +- Pipelined FMA unit +- Some sort of Conway/Scoreboard/Tomasulo dynamic execution mechanism to make + best use of pipelined FMA unit + +- Questions: + - RV32F or Zfinx ? Probably start with Zfinx (one difficulty at a time: + more complicated RV32F datapath will be harder with pipeline), then + "morph" it to RV32F + - single-precision or double-precision ? Depends on FPGA capabilities. + - other instructions in hw: + - vector math + - extensions implemented by Hazard3 (huge performance gain it seems)