diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8439a9d --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*~ +*.out +*.vcd diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..839197d --- /dev/null +++ b/Makefile @@ -0,0 +1,31 @@ +# Assembly simulation in Verilog unified Makefile example + +include settings.mk + +GTKWAVE := gtkwave +SIM := vvp + +# Final waveform to produce is the combination of machine and program +WAVEFORM := $(TOPLEVEL)-$(PROGRAM).vcd +WAVEOPTS := filters/$(WAVEFORM:vcd=gtkw) + + +# Build memory image, compile Verilog, run simulation to produce VCD trace +$(WAVEFORM): settings.mk + $(MAKE) -C asm $(MEMDUMP) + $(MAKE) -C verilog $(TOPLEVEL).vvp + $(SIM) verilog/$(TOPLEVEL).vvp +mem_fn=asm/$(MEMDUMP) +dump_fn=$@ + + +# Open waveform with saved formatting and filter options +scope: $(WAVEFORM) $(WAVEOPTS) + $(GTKWAVE) $(WAVEOPTS) + + +# Remove generated files, including from subdirectories +clean: + $(MAKE) -C asm clean + $(MAKE) -C verilog clean + rm -f $(WAVEFORM) + +.PHONY: scope clean diff --git a/REPORT.MD b/REPORT.MD new file mode 100644 index 0000000..406cb56 --- /dev/null +++ b/REPORT.MD @@ -0,0 +1,111 @@ +# Lab 3 Writeup +### William Derksen, Alexander Hoppe, Sam Myers, Taylor Sheneman + +## Processor Architecture + +In this lab we designed a Single Cycle CPU complete with an Instruction Fetch Unit, Instruction Decoder, Arithmetic Logic Unit and associated logic, Memory Access, and Writeback to Registers. We based the design off of our previous in-class designs, with a couple important changes in order to accommodate different instructions. + +Broadly, our architecture uses the value from the Program Counter to fetch instructions from the first segment of memory (from address 0 - 8192). We then decode this instruction into each of the major components of the I- J- and R-type encodings, as well as into an array of control signals depending on the opcode. 
These signals and controls affect the Program counter in the instruction fetch unit (in order to branch and jump), the ALU (to execute mathematical operations), the Memory (to load and store), and the Register File (for determining I J and R type instructions). + +Our processor architecture supports a subset of the MIPS ISA, consisting of `LW`, `SW`, `J`, `JR`, `JAL`, `BNE`, `XORI`, `ADDI`, `ADD`, `SUB`, and `SLT`. + +Changelog: + +In order to correctly execute the new instruction `xori`, we needed to create a more versatile Sign Extend component that could also Zero Extend if needed. We came up with a simple design with a Sign Extend control signal that allowed for both of these processes without needing a bitwise multiplexer. We did this by setting the seventeenth bit of the input to the `and` of the control signal and the sixteenth bit of the input. Then we sign extend naturally from the 17th bit. + +Additionally, for jump and link we added two new muxes, one for choosing register `$31` in order to do `jal`, and one right before the writeback to the `Dw` port for the regfile. The second mux selects the PC-derived return address as the value that is to be written to register `$31`. For `jal`, the jump part is the same as a normal jump instruction. + +When first trying to run code on our CPU, we changed our architecture from a Harvard architecture with separate memory for instructions and data to a Von Neumann architecture with one contiguous memory to be more compatible with the Mars assembler's output for MIPS. We also changed our program counter to increment by 1, and our `jal` instruction to store `PC + 2` because our memory is word-addressed by 32-bit word as opposed to byte-addressing. + +While running basic tests for the first time, we were confronted with issues loading and storing data to memory. For some reason we were only executing odd-numbered instructions for a while, which was very problematic. We also were having loads and stores executing out of order. 
+ +out-of-order + +Eventually we traced this back to memory and writeback instructions being triggered `@ (posedge clk)` which was preventing instructions from being fetched properly and also preventing loads and stores from executing except for every other cycle. We solved this by establishing the single-cycle timing scheme below and enforcing it, moving memory write and writeback operations to the negative clock edge. + +clock-cycle + +This diagram has the program counter updating on the positive edge, then instructions being read from memory combinationally, decoded combinationally, ALU result being computed combinationally, and then the memory or writeback happening on negative edge. In a real single-cycle CPU we'd have to worry about timing for all of these, but for our model the combinational steps are instantaneous so this is not an issue. + + +### Block Diagram +top-level-block-diagram + +Our single-cycle CPU is divided up into two main modules, with one shared memory between all of them. This design arose because we originally implemented a Harvard architecture, but then pivoted to Von Neumann later down the line. + +#### Instruction Fetch +Inside our instruction-fetch module we keep track of the program counter and any jump or branch signals that need to modify the PC input to instruction memory or set it to a different value. + +IF + +Below is the table of all of the control signals for our CPU and all of the different states they are in. In this diagram, control signals that do not matter for a particular instruction are drawn in blue, and those that are important are in red. + +decode + +#### Data Path + +Below is the data path module, which can operate on two registers or an immediate and write its arithmetic result to memory or a register, or store the program counter (plus two instructions) in the case of a `jal`. 
We did make a custom module to do sign-extending or logical-extending for `xori`, since the difference is just what we extend with, zeros or the MSB. + +data-path + + + +### RTL Example + +To more clearly illustrate the path of an instruction through our CPU, the following is a walkthrough of a single cycle of program execution using the `addi` instruction: + +`Reg[rt] = Reg[rs] + SignExtend(Imm16)` + +#### IF + +The first stage is instruction fetch, kicked off by the positive edge of our clock incrementing the program counter. The program counter input is already fed a combinational `PC + 1` (since our memory is 32 bit word-addressed) so the PC is simply a positive-edge-triggered D-flipflop. After incrementing, the PC is fed into the instruction address port of our memory, which combinationally returns the next instruction on the associated data port. + +#### ID / EX + +Once the instruction is read from memory, it is then fed through our instruction decode unit. This unit combinationally sets the register addresses `rs` and `rt` and also the `imm16` as well as decoding the opcode of the instruction into the proper control signals seen in the previous section. + +Next is the execute section, which also executes combinationally. The output of the regfile `Da` is fed into the ALU, and the other input is muxed to the output of our sign-extend module, which sign-extends `imm16`. The ALU result is then displayed combinationally. Since there are no delays in our ideal system, this all happens instantaneously after the positive clock edge. + +#### MEM / WB + +After the ID and EX phases have finished, the memory and writeback phases (MEM and WB) execute on the negative edge of the clock. On the negative clock edge the result of the ALU is clocked back into the `Dw` port of the regfile, which has been set to `rt` by the instruction decode module. 
The memory is not written to in this case, as `addi` is not a store instruction, so the memory write enable is disabled by the ID unit. + +The clock cycle for all of this would have to be selected to be sufficiently long for the combinational logic of a memory read and then an ALU operation, plus a memory write on the negative part of the clock cycle. + +## Test Plan +We implemented various assembly-based tests to test our CPU and also to push our assembly programming skills. For the very basic first functionality test, we implemented `asm/add.asm` which is just a series of adds. We also implemented a complete test of our instruction set in `asm/basic_testbench.asm`, and finally we implemented the quicksort algorithm in assembly, which can be found in `asm/quicksort.asm`. + +#### add.asm +The add test was just a series of `addi` instructions to the same `$t0` register to allow us to debug our ability to load memory into our CPU and test the instruction fetch order. It actually allowed us to debug our instruction fetch bug in which instructions were taking two cycles to execute. It turned out that the instruction output of our memory was only updating `@ (address)` which was the data memory address port, so the instruction memory wasn't being output until the writeback state of a previous instruction. + +#### Quicksort +In addition to a simpler test bench, we decided to target the implementation of a more challenging algorithm to really push our assembly skills, in this case the quicksort algorithm. This algorithm requires us to use all eleven instructions in our reduced set. + +We started by implementing the quicksort algorithm in C, which allowed us to get the control flow down in a language that we could pretty easily expand out into direct memory manipulations and register operations. From there we expanded it out into assembly, primarily the work of @Halliax Taylor Sheneman. Once it successfully executed in MARS, we loaded it into our CPU and were met with success! 
It was a difficult program to use to debug the CPU, but it provided a stress test to make sure our hardware worked in an actual implementation. + + +#### Assembly Test Bench +The assembly test bench was a program designed to test all of the different functions in our ISA. It has explicit test subroutines for `lw` and `sw`, `bne`, `xori`, `add`, `sub`, and `slt`. We chose to omit dedicated tests for `j`, `jr`, `jal`, and `addi` because the rest of the testbench used them so heavily that it would never complete successfully if they were not correct. Each subroutine sets a return value in a top level register, `$s7`, which is checked before executing additional subroutines. If `$s7` is set to 1, a test has failed and the program exits (falls into a jump trap loop). If at the end of the program, the `$s7` register is zero, then the testbench succeeded. The testbench ends with a MARS syscall which displays the return value in `$s7`, which successfully returned zero when run on the MARS MIPS emulator. + +This testbench is what is run by using the `./test.sh` script in the top level directory of the project. + +This testbench allowed us to debug our memory load and store issue having to do with the read-execute-write cycle happening all at the same clock cycle. Once we resolved this issue, we were met with the following result: + +test-success + +As is visible here, the `$s7` register was still `0x00000000` at the time of program end. This can be seen as the `Da` bus (combinational output of the regfile) is all zeros during instruction 73 when the input address `rs` is `$s7`. This means the test bench succeeded. + + +## Performance/Area Design analysis +Since our design is single-cycle and written behaviorally for code to be uploaded later, the Vivado synthesis of our program tends to optimize out pretty much all of the actual execution of instructions, since the memory is default initialized to zero. 
This is interpreted by the synthesizer as effectively having an empty program, hence it does not need any actual CPU. + +Failing synthesizer performance and area analysis, there are a few performance and area heuristic analyses that can be applied. First, our project is fundamentally limited in throughput by nature of it only being able to execute one instruction at a time in a clock cycle equivalent to the entire length of execution. We can execute an instruction in the time it takes to load an instruction from memory, decode it, do an ALU computation, and then in the worst case write back to memory. This is quite slow, as usually memory access takes much longer than other computing operations. + +We're also limited in area, as we cannot do any module reuse within the single-cycle CPU. For example, we have an additional ALU in our CPU for doing the adding to increment the `PC`, because we can't do reuse when our main program ALU is going to be required later in the same cycle. We were able to reuse our memory, however, because our read and write access happens at different times during the clock cycle. We could potentially have saved area by consolidating our read and write ports into one, since they're accessed at different times, but this would require more timing control signals than just having combinational outputs. + +While a single-cycle design is limited in these ways, it's also much simpler to design than a much higher-throughput pipeline design. It's streamlined in that it can all be drawn out on one schematic without any need for mental juggling of timing. It was an MVP for this project, and it ended up being sufficiently challenging. + +## Work Plan Reflection +As is often the case, we underestimated how much work this lab would take us in the beginning. We had originally planned to rush to MVP with a single-cycle CPU and then branch out to complete a pipeline CPU if we had time. 
In the end, it took us the first week to plan out how everything would be implemented which was on schedule, but then the code implementation of our CPU took more time than just the weekend. Fortunately, at the same time we were able to develop our assembly test bench, so when we actually did finish implementation on Wednesday of the final week, we were able to flash code on it immediately. + +The place we could've improved the process here was in allowing more time for debugging and parallelizing more. We were met with a couple of very difficult issues over the course of the last few days of the project, and with only Thursday and Friday to debug them, we were really under the gun. In fact, we only barely finished passing our test benches at around 11:59 on Friday (hence the report is quite late). We also could have parallelized our development process more. When we developed our test benches and implemented our verilog at the same time, we were much more productive than when we were single-threadedly debugging our program at the end of the week. The independent clock and memory access issues could have been resolved at the same time, and the report could have been written during that time as well. 
diff --git a/RISC_RTL.md b/RISC_RTL.md new file mode 100644 index 0000000..e9af40f --- /dev/null +++ b/RISC_RTL.md @@ -0,0 +1,85 @@ +# Single cycle MIPS RISC RTL + +## Instructions: +`LW`, `SW`, `J`, `JR`, `JAL`, `BNE`, `XORI`, `ADDI`, `ADD`, `SUB`, `SLT` + +## RTL + +### `LW` +I-type +Load Word - 0x23 +``` +R[rt] = M[R[rs]+SignExtImm] +``` + +### `SW` +I-type +Store Word - 0x2B +``` +M[R[rs]+SignExtImm] = R[rt] +``` + +### `J` +J-type +Jump - 0x02 +``` +PC=JumpAddr +``` + +### `JR` +R-type Funct 0x08 +Jump Register - 0x00 +``` +PC=R[rs] +``` + +### `JAL` +J-type +Jump and Link - 0x03 +``` +R[31] = PC+8; +PC = JumpAddr +``` + +### `BNE` +I-type +Branch On Not Equal - 0x05 +``` +if(R[rs]!=R[rt]) + PC=PC+4+BranchAddr +``` + +### `XORI` +I-type +Xor immediate - 0x0E +``` +R[rt] = R[rs] ^ ZeroExtImm +``` + +### `ADDI` +I-type +Add Immediate - 0x08 +``` +R[rt] = R[rs] + SignExtImm +``` + +### `ADD` +R-type Funct 0x20 +Add - 0x00 +``` +R[rd] = R[rs] + R[rt] +``` + +### `SUB` +R-type Funct 0x22 +Subtract - 0x00 +``` +R[rd] = R[rs] - R[rt] +``` + +### `SLT` +R-type Funct 0x2A +Set Less Than - 0x00 +``` +R[rd] = (R[rs] < R[rt]) ? 
1 : 0 +``` diff --git a/asm/Makefile b/asm/Makefile new file mode 100644 index 0000000..ab8c462 --- /dev/null +++ b/asm/Makefile @@ -0,0 +1,27 @@ +# Generate machine code memory image from MIPS assembly + +# Get PROGRAM and MEMDUMP from project settings +include ../settings.mk + +MARS_PATH := ~/Documents/CompArch/mips/Mars4_5.jar +MARS_OPTS := a mc CompactTextAtZero +MARS := java -jar $(MARS_PATH) $(MARS_OPTS) + + +# Pattern rule for generating .text memory dump from MIPS assembly +%.text.hex: %.asm + $(MARS) dump .text HexText $@ $< + +# Pattern rule for generating .data memory dump from MIPS assembly +%.data.hex: %.asm + $(MARS) dump .data HexText $@ $< + + +# Shortcut (phony) targets for convenience +assemble: $(MEMDUMP) + +clean: + -rm -f $(MEMDUMP) + + +.PHONY: assemble clean diff --git a/asm/add.asm b/asm/add.asm new file mode 100644 index 0000000..4d4a879 --- /dev/null +++ b/asm/add.asm @@ -0,0 +1,14 @@ +# most basic test function for MIPS reduced Instantiate + +nop +addi $t0, $zero, 0 +addi $t0, $zero, 1 +addi $t0, $zero, 2 +addi $t0, $zero, 3 +addi $t0, $zero, 4 +addi $t0, $zero, 5 +addi $t0, $zero, 6 +addi $t0, $zero, 7 +addi $t0, $zero, 8 +addi $t0, $zero, 9 +addi $t0, $zero, 10 diff --git a/asm/basic_testbench.asm b/asm/basic_testbench.asm new file mode 100644 index 0000000..3493485 --- /dev/null +++ b/asm/basic_testbench.asm @@ -0,0 +1,151 @@ +# Test bench function for Lab3 reduced MIPS ISA +# $s7 = ('tests failed') ? 
1 : 0 +nop +addi $gp, $zero, 0x2000 + +main: # Run all tests conditionally + +jal test_lw_sw +bne $s7, $zero, test_end + +jal test_bne +bne $s7, $zero, test_end + +jal test_xori +bne $s7, $zero, test_end + +jal test_add +bne $s7, $zero, test_end + +jal test_sub +bne $s7, $zero, test_end + +jal test_slt +bne $s7, $zero, test_end + +j test_end + + +test_lw_sw: +# Initialize values +addi $t0, $zero, 30 +addi $t1, $zero, 87 +addi $t2, $zero, 4 +# store to heap +sw $t0, 16($gp) +sw $t1, 12($gp) +sw $t2, 8($gp) +# load from heap +lw $t3, 16($gp) +lw $t4, 12($gp) +lw $t5, 8($gp) +# compare equality +bne $t0, $t3, lw_sw_fail +bne $t1, $t4, lw_sw_fail +bne $t2, $t5, lw_sw_fail +jr $ra + +lw_sw_fail: +addi $s7, $zero, 1 #set testfailed to true +jr $ra + +test_bne: +# initialize values +addi $t0, $zero, 40 +addi $t1, $zero, 16 +addi $t2, $zero, 40 + +bne $t0, $t2, bne_fail +bne $t0, $t1, bne_pass +bne_fail: +addi $s7, $zero, 1 +bne_pass: +jr $ra + +test_xori: +#initialize values +addi $t0, $zero, 0xf0 +addi $t1, $zero, 0x0f + +# xor identical things +xori $t2, $t0, 0xf0 +# fail if not zero +bne $t2, $zero, xori_fail +# xor different things +xori $t2, $t1, 0xf0 +# pass if zero +bne $t2, $zero, xori_pass +xori_fail: +addi $s7, $zero, 1 +xori_pass: +jr $ra + +test_add: +# initialize values +addi $t0, $zero, 13 +addi $t1, $zero, 31 +addi $t2, $zero, 44 + +# add registers +add $t3, $t0, $t1 +# fail if not expected result +bne $t3, $t2, add_fail +# otherwise pass +j add_pass + +add_fail: +addi $s7, $zero, 1 +add_pass: +jr $ra + + +test_sub: +# initialize values +addi $t0, $zero, 31 +addi $t1, $zero, 13 +addi $t2, $zero, 18 + +# subtract registers +sub $t3, $t0, $t1 +# fail if not expected result +bne $t3, $t2, sub_fail +# otherwise pass +j sub_pass + +sub_fail: +addi $s7, $zero, 1 +sub_pass: +jr $ra + +test_slt: +# initialize values +addi $t0, $zero, 13 +addi $t1, $zero, 15 + +# 13 is less than 15 +slt $t2, $t0, $t1 +bne $t2, 1, slt_fail + +# 15 is not less than 13 +slt $t2, $t1, 
$t0 +bne $t2, 0, slt_fail +j slt_pass + +slt_fail: +addi $s7, $zero, 1 +slt_pass: +jr $ra + + +# End the program +test_end: +add $a0, $s7, $zero +addi $v0, $zero, 1 +syscall +j jump_trap + +jump_trap: +nop +nop +nop +j jump_trap diff --git a/asm/quicksort.asm b/asm/quicksort.asm new file mode 100644 index 0000000..501470a --- /dev/null +++ b/asm/quicksort.asm @@ -0,0 +1,231 @@ +main: +addi $sp, $zero, 0x00003ffc +la $s0, array +addi $a0, $zero, 0 +addi $a1, $zero, 9 +jal quicksort +j done + + +quicksort: +# $s0 = arr* +# $a0 = start +# $a1 = end +# $t0 = pivot +# $t1 = branch check temporary (for xori and slt) + + +# if start < end, run quicksort +slt $t1, $a0, $a1 +bne $t1, $zero, run +j end + +run: +# push frame onto stack +addi $sp, $sp, -12 +sw $ra, 8($sp) +sw $a0, 4($sp) +sw $a1, ($sp) + +# pivot = partition (arr, start, end) +jal partition + +# pop frame from stack +lw $ra, 8($sp) +lw $a0, 4($sp) +lw $a1, ($sp) +addi $sp, $sp, 12 + +add $t0, $zero, $v0 + +# push frame onto stack +addi $sp, $sp, -16 +sw $ra, 12($sp) +sw $a0, 8($sp) +sw $a1, 4($sp) +sw $t0, ($sp) + +# quicksort(arr, start, pivot - 1) +addi $a1, $t0, -1 + +jal quicksort + +# pop frame from stack +lw $ra, 12($sp) +lw $a0, 8($sp) +lw $a1, 4($sp) +lw $t0, ($sp) +addi $sp, $sp, 16 + +# push frame onto stack +addi $sp, $sp, -12 +sw $ra, 8($sp) +sw $a0, 4($sp) +sw $a1, ($sp) + +# quicksort(arr, pivot + 1, end) +add $a0, $t0, 1 + +jal quicksort + +# pop frame from stack +lw $ra, 8($sp) +lw $a0, 4($sp) +lw $a1, ($sp) +addi $sp, $sp, 12 + +end: +jr $ra + + +partition: +# $v0 = return val +# $a0 = start +# $a1 = end +# $a2 = arr index (calcMemAddr) +# $s0 = arr* +# $s1 = pivot +# $s2 = i (counter) +# $s3 = j (counter) +# $s4 = arr[i] val +# $s5 = arr[j] val +# $t0 = branch check temporary (for xori and slt) +# $t3 = arr[end] addr +# $t4 = arr[i] addr +# $t5 = arr[j] addr + +addi $sp, $sp, -4 +sw $ra, ($sp) +# ----------------------------------------------------------------- +## int pivot = arr[end] +## 
int i = start - 1; + +# set arr index to end and call calcMemAddr +add $a2, $zero, $a1 +jal calcMemAddr +# set arr[end] addr +add $t3, $zero, $v0 + +# set reg pivot to mem[arr[end]] +lw $s1, ($t3) + +# set i to start - 1 +sub $s2, $a0, 1 + +# ----------------------------------------------------------------- +## for (int j = start; j < end; j++) { +## if (arr[j] <= pivot) { +## i++; +## int temp = arr[i]; +## arr[i] = arr[j]; +## arr[j] = temp; +## } +## } + +# set j to start and jump to check +add $s3, $zero, $a0 +j forcheck +forloop: + +# set arr index to j and call calcMemAddr +add $a2, $zero, $s3 +jal calcMemAddr +# set arr[j] addr +add $t5, $zero, $v0 + +# set reg arr[j] to mem[arr[j]] +lw $s5, ($t5) + +# check if arr[j] <= pivot +addi $s1, $s1, 1 +slt $t0, $s5, $s1 +addi $s1, $s1, -1 + +# execute swap if slt is true +bne $t0, $zero, swap +j increment +swap: +# i++ +addi $s2, $s2, 1 + +# set arr index to i and call calcMemAddr +add $a2, $zero, $s2 +jal calcMemAddr +# set arr[i] addr +add $t4, $zero, $v0 + +# set reg arr[i] to mem[arr[i]] +lw $s4, ($t4) + +# store in opposite places +sw $s4, ($t5) +sw $s5, ($t4) + +# increment j +increment: +addi $s3, $s3, 1 +# break for loop when j = end +forcheck: +bne $s3, $a1, forloop + +# ----------------------------------------------------------------- +## int temp = arr[i + 1]; +## arr[i + 1] = arr[end]; +## arr[end] = temp; +## return i + 1; + +# set i to i + 1, arr index to i + 1 and call calcMemAddr +addi $s2, $s2, 1 +add $a2, $zero, $s2 +jal calcMemAddr + +# set reg arr[i] to mem[arr[i + 1]] +lw $s4, ($v0) +# store pivot at mem[arr[i + 1]] +sw $s1, ($v0) + +# store reg arr[i] (holding arr[i + 1]) into mem[arr[end]] +sw $s4, ($t3) + +#return i + 1 +add $v0, $zero, $s2 +lw $ra, ($sp) +addi $sp, $sp, 4 +jr $ra + + +calcMemAddr: +# $v0 = addr (return) +# $t0 = multiply counter temporary +# $t1 = branch check temporary (for xori and slt) + +# set addr to arr* and mult counter to 0 +add $v0, $zero, $s0 +addi $t0, $zero, 0 
+calc: +# add index to addr, 1 to mult counter +add $v0, $v0, $a2 +addi $t0, $t0, 1 +# if mult counter != 4, loop +xori $t1, $t0, 4 +bne $t1, $zero, calc +jr $ra + + +done: +j done +# addi $v0, $zero, 10 +# syscall + +.data +array: +0x00000009 +0x00000005 +0x00000003 +0x00000006 +0x00000002 +0x00000008 +0x00000007 +0x00000003 +0x00000001 +0x00000004 diff --git a/asmtest/DRAGAN/README.md b/asmtest/DRAGAN/README.md new file mode 100644 index 0000000..2fed225 --- /dev/null +++ b/asmtest/DRAGAN/README.md @@ -0,0 +1,13 @@ +# Quicksort algorithm + +### Preconditions: + +Array { 9, 5, 3, 6, 2, 8, 7, 3, 1, 4 } loaded into .data (should be handled by quicksort.asm) + +### Expected result: + +Array { 1, 2, 3, 3, 4, 5, 6, 7, 8, 9 } in .data + +### Required additional instructions: + +N/A diff --git a/asmtest/DRAGAN/basic_testbench.asm b/asmtest/DRAGAN/basic_testbench.asm new file mode 100644 index 0000000..52433a0 --- /dev/null +++ b/asmtest/DRAGAN/basic_testbench.asm @@ -0,0 +1,146 @@ +# Test bench function for Lab3 reduced MIPS ISA +# $s7 = ('tests failed') ? 
1 : 0 + +main: # Run all tests conditionally + +jal test_lw_sw +bne $s7, $zero, test_end + +jal test_bne +bne $s7, $zero, test_end + +jal test_xori +bne $s7, $zero, test_end + +jal test_add +bne $s7, $zero, test_end + +jal test_sub +bne $s7, $zero, test_end + +jal test_slt +bne $s7, $zero, test_end + +j test_end + + +test_lw_sw: +# Initialize values +addi $t0, $zero, 30 +addi $t1, $zero, 87 +# store to heap +sw $t0, 16($gp) +sw $t1, 12($gp) +# load from heap +lw $t2, 16($gp) +lw $t3, 12($gp) +# compare equality +bne $t0, $t2, lw_sw_fail +bne $t1, $t3, lw_sw_fail +jr $ra + +lw_sw_fail: +addi $s7, $zero, 1 #set testfailed to true +jr $ra + +test_bne: +# initialize values +addi $t0, $zero, 40 +addi $t1, $zero, 16 +addi $t2, $zero, 40 + +bne $t0, $t2, bne_fail +bne $t0, $t1, bne_pass +bne_fail: +addi $s7, $zero, 1 +bne_pass: +jr $ra + +test_xori: +#initialize values +addi $t0, $zero, 0x0000f0f0 +addi $t1, $zero, 0x00000f0f + +# xor identical things +xori $t2, $t0, 0x0000f0f0 +# fail if not zero +bne $t2, $zero, xori_fail +# xor different things +xori $t2, $t1, 0X0000f0f0 +# pass if zero +bne $t2, $zero, xori_pass +xori_fail: +addi $s7, $zero, 1 +xori_pass: +jr $ra + +test_add: +# initialize values +addi $t0, $zero, 13 +addi $t1, $zero, 31 +addi $t2, $zero, 44 + +# add registers +add $t3, $t0, $t1 +# fail if not expected result +bne $t3, $t2, add_fail +# otherwise pass +j add_pass + +add_fail: +addi $s7, $zero, 1 +add_pass: +jr $ra + + +test_sub: +# initialize values +addi $t0, $zero, 31 +addi $t1, $zero, 13 +addi $t2, $zero, 18 + +# subtract registers +sub $t3, $t0, $t1 +# fail if not expected result +bne $t3, $t2, sub_fail +# otherwise pass +j sub_pass + +sub_fail: +addi $s7, $zero, 1 +sub_pass: +jr $ra + +test_slt: +# initialize values +addi $t0, $zero, 13 +addi $t1, $zero, 15 + +# 13 is less than 15 +slt $t2, $t0, $t1 +bne $t2, 1, slt_fail + +# 15 is not less than 13 +slt $t2, $t1, $t0 +bne $t2, 0, slt_fail +j slt_pass + +slt_fail: +addi $s7, $zero, 1 +slt_pass: +jr 
$ra + + +# End the program +test_end: +add $a0, $s7, $zero +addi $v0, $zero, 1 +syscall +j jump_trap + +jump_trap: +nop +nop +nop +j jump_trap + diff --git a/asmtest/DRAGAN/quicksort.asm b/asmtest/DRAGAN/quicksort.asm new file mode 100644 index 0000000..501470a --- /dev/null +++ b/asmtest/DRAGAN/quicksort.asm @@ -0,0 +1,231 @@ +main: +addi $sp, $zero, 0x00003ffc +la $s0, array +addi $a0, $zero, 0 +addi $a1, $zero, 9 +jal quicksort +j done + + +quicksort: +# $s0 = arr* +# $a0 = start +# $a1 = end +# $t0 = pivot +# $t1 = branch check temporary (for xori and slt) + + +# if start < end, run quicksort +slt $t1, $a0, $a1 +bne $t1, $zero, run +j end + +run: +# push frame onto stack +addi $sp, $sp, -12 +sw $ra, 8($sp) +sw $a0, 4($sp) +sw $a1, ($sp) + +# pivot = partition (arr, start, end) +jal partition + +# pop frame from stack +lw $ra, 8($sp) +lw $a0, 4($sp) +lw $a1, ($sp) +addi $sp, $sp, 12 + +add $t0, $zero, $v0 + +# push frame onto stack +addi $sp, $sp, -16 +sw $ra, 12($sp) +sw $a0, 8($sp) +sw $a1, 4($sp) +sw $t0, ($sp) + +# quicksort(arr, start, pivot - 1) +addi $a1, $t0, -1 + +jal quicksort + +# pop frame from stack +lw $ra, 12($sp) +lw $a0, 8($sp) +lw $a1, 4($sp) +lw $t0, ($sp) +addi $sp, $sp, 16 + +# push frame onto stack +addi $sp, $sp, -12 +sw $ra, 8($sp) +sw $a0, 4($sp) +sw $a1, ($sp) + +# quicksort(arr, pivot + 1, end) +add $a0, $t0, 1 + +jal quicksort + +# pop frame from stack +lw $ra, 8($sp) +lw $a0, 4($sp) +lw $a1, ($sp) +addi $sp, $sp, 12 + +end: +jr $ra + + +partition: +# $v0 = return val +# $a0 = start +# $a1 = end +# $a2 = arr index (calcMemAddr) +# $s0 = arr* +# $s1 = pivot +# $s2 = i (counter) +# $s3 = j (counter) +# $s4 = arr[i] val +# $s5 = arr[j] val +# $t0 = branch check temporary (for xori and slt) +# $t3 = arr[end] addr +# $t4 = arr[i] addr +# $t5 = arr[j] addr + +addi $sp, $sp, -4 +sw $ra, ($sp) +# ----------------------------------------------------------------- +## int pivot = arr[end] +## int i = start - 1; + +# set arr index to end and call 
calcMemAddr +add $a2, $zero, $a1 +jal calcMemAddr +# set arr[end] addr +add $t3, $zero, $v0 + +# set reg pivot to mem[arr[end]] +lw $s1, ($t3) + +# set i to start - 1 +sub $s2, $a0, 1 + +# ----------------------------------------------------------------- +## for (int j = start; j < end; j++) { +## if (arr[j] <= pivot) { +## i++; +## int temp = arr[i]; +## arr[i] = arr[j]; +## arr[j] = temp; +## } +## } + +# set j to start and jump to check +add $s3, $zero, $a0 +j forcheck +forloop: + +# set arr index to j and call calcMemAddr +add $a2, $zero, $s3 +jal calcMemAddr +# set arr[j] addr +add $t5, $zero, $v0 + +# set reg arr[j] to mem[arr[j]] +lw $s5, ($t5) + +# check if arr[j] <= pivot +addi $s1, $s1, 1 +slt $t0, $s5, $s1 +addi $s1, $s1, -1 + +# execute swap if slt is true +bne $t0, $zero, swap +j increment +swap: +# i++ +addi $s2, $s2, 1 + +# set arr index to i and call calcMemAddr +add $a2, $zero, $s2 +jal calcMemAddr +# set arr[i] addr +add $t4, $zero, $v0 + +# set reg arr[i] to mem[arr[i]] +lw $s4, ($t4) + +# store in opposite places +sw $s4, ($t5) +sw $s5, ($t4) + +# increment j +increment: +addi $s3, $s3, 1 +# break for loop when j = end +forcheck: +bne $s3, $a1, forloop + +# ----------------------------------------------------------------- +## int temp = arr[i + 1]; +## arr[i + 1] = arr[end]; +## arr[end] = temp; +## return i + 1; + +# set i to i + 1, arr index to i + 1 and call calcMemAddr +addi $s2, $s2, 1 +add $a2, $zero, $s2 +jal calcMemAddr + +# set reg arr[i] to mem[arr[i + 1]] +lw $s4, ($v0) +# store pivot at mem[arr[i + 1]] +sw $s1, ($v0) + +# store reg arr[i] (holding arr[i + 1]) into mem[arr[end]] +sw $s4, ($t3) + +#return i + 1 +add $v0, $zero, $s2 +lw $ra, ($sp) +addi $sp, $sp, 4 +jr $ra + + +calcMemAddr: +# $v0 = addr (return) +# $t0 = multiply counter temporary +# $t1 = branch check temporary (for xori and slt) + +# set addr to arr* and mult counter to 0 +add $v0, $zero, $s0 +addi $t0, $zero, 0 +calc: +# add index to addr, 1 to mult counter +add 
$v0, $v0, $a2 +addi $t0, $t0, 1 +# if mult counter != 4, loop +xori $t1, $t0, 4 +bne $t1, $zero, calc +jr $ra + + +done: +j done +# addi $v0, $zero, 10 +# syscall + +.data +array: +0x00000009 +0x00000005 +0x00000003 +0x00000006 +0x00000002 +0x00000008 +0x00000007 +0x00000003 +0x00000001 +0x00000004 diff --git a/asmtest/DRAGAN/quicksort.c b/asmtest/DRAGAN/quicksort.c new file mode 100644 index 0000000..77c7142 --- /dev/null +++ b/asmtest/DRAGAN/quicksort.c @@ -0,0 +1,35 @@ +#include + +int partition(int *arr, int start, int end) { + int pivot = arr[end]; + int i = start - 1; + for (int j = start; j <= end - 1; j++) { + if (arr[j] <= pivot) { + i++; + int temp = arr[i]; + arr[i] = arr[j]; + arr[j] = temp; + } + } + int temp = arr[i + 1]; + arr[i + 1] = arr[end]; + arr[end] = temp; + return i + 1; +} + +void quicksort(int *arr, int start, int end) { + if (start < end) { + int pivot = partition(arr, start, end); + quicksort(arr, start, pivot - 1); + quicksort(arr, pivot + 1, end); + } +} + +int main() { + int arr[] = { 9, 5, 3, 6, 2, 8, 7, 3, 1, 4 }; + quicksort(arr, 0, 9); + for (int i = 0; i < 10; i++) { + printf("%d\n", arr[i]); + } + return 0; +} diff --git a/filters/fake_cpu-fib_func.gtkw b/filters/fake_cpu-fib_func.gtkw new file mode 100644 index 0000000..b39c5ac --- /dev/null +++ b/filters/fake_cpu-fib_func.gtkw @@ -0,0 +1,52 @@ +[*] +[*] GTKWave Analyzer v3.3.79 (w)1999-2017 BSI +[*] Wed Nov 15 00:28:31 2017 +[*] +[dumpfile] "fake_cpu-fib_func.vcd" +[dumpfile_mtime] "Wed Nov 15 00:24:17 2017" +[dumpfile_size] 7517 +[savefile] "filters/fake_cpu-fib_func.gtkw" +[timestart] 0 +[size] 1000 600 +[pos] -1 -1 +*-5.270822 57 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +[treeopen] cpu_test. +[treeopen] cpu_test.cpu. 
+[sst_width] 193 +[signals_width] 133 +[sst_expanded] 1 +[sst_vpaned_height] 168 +@28 +cpu_test.clk +cpu_test.reset +@200 +---A-- +@22 +cpu_test.cpu.PC_A[31:0] +cpu_test.cpu.INS_A[31:0] +@200 +---B-- +@22 +cpu_test.cpu.PC_B[31:0] +cpu_test.cpu.INS_B[31:0] +@2022 +^2 filters/mips-opcodes.filter +cpu_test.cpu.OP_B[5:0] +@200 +---C-- +@22 +cpu_test.cpu.PC_C[31:0] +cpu_test.cpu.INS_C[31:0] +@2022 +^4 filters/mips-funct.filter +cpu_test.cpu.FUNCT_C[5:0] +@2023 +^2 filters/mips-opcodes.filter +cpu_test.cpu.OP_C[5:0] +@2022 +^1 filters/mips-regs.filter +cpu_test.cpu.RS_C[4:0] +^1 filters/mips-regs.filter +cpu_test.cpu.RT_C[4:0] +[pattern_trace] 1 +[pattern_trace] 0 diff --git a/filters/mips-funct.filter b/filters/mips-funct.filter new file mode 100644 index 0000000..654e706 --- /dev/null +++ b/filters/mips-funct.filter @@ -0,0 +1,21 @@ +# MIPS funct codes (R-type, OP=0x00) +00 sll +02 srl +03 sra +08 jr +10 mfhi +12 mflo +18 mult +19 multu +1A div +1B divu +20 add +21 addu +22 sub +23 subu +24 and +25 or +26 xor +27 nor +2A slt +2B sltu diff --git a/filters/mips-opcodes.filter b/filters/mips-opcodes.filter new file mode 100644 index 0000000..fedc8cc --- /dev/null +++ b/filters/mips-opcodes.filter @@ -0,0 +1,20 @@ +# MIPS opcodes +00 R-type +02 j +03 jal +04 beq +05 bne +08 addi +09 addiu +0A slti +0B sltiu +0C andi +0D ori +0F lui +10 mfc0 +23 lw +24 lbu +25 lhu +28 sb +29 sh +2B sw diff --git a/filters/mips-regs.filter b/filters/mips-regs.filter new file mode 100644 index 0000000..f704bdb --- /dev/null +++ b/filters/mips-regs.filter @@ -0,0 +1,33 @@ +# MIPS register names +00 $zero +01 $at +02 $v0 +03 $v1 +04 $a0 +05 $a1 +06 $a2 +07 $a3 +08 $t0 +09 $t1 +0A $t2 +0B $t3 +0C $t4 +0D $t5 +0E $t6 +0F $t7 +10 $s0 +11 $s1 +12 $s2 +13 $s3 +14 $s4 +15 $s5 +16 $s6 +17 $s7 +18 $t8 +19 $t9 +1a $k0 +1b $k1 +1c $gp +1d $sp +1e $fp +1f $ra diff --git a/filters/singleCycleCPU-add.gtkw b/filters/singleCycleCPU-add.gtkw new file mode 100644 index 0000000..c6b14a7 --- /dev/null +++ 
b/filters/singleCycleCPU-add.gtkw @@ -0,0 +1,50 @@ +[*] +[*] GTKWave Analyzer v3.3.66 (w)1999-2015 BSI +[*] Fri Nov 17 05:06:51 2017 +[*] +[dumpfile] "/home/aehoppe/Documents/CompArch/Lab3/add.vcd" +[dumpfile_mtime] "Fri Nov 17 04:54:22 2017" +[dumpfile_size] 33127 +[savefile] "/home/aehoppe/Documents/CompArch/Lab3/filters/singleCycleCPU-add.gtkw" +[timestart] 0 +[size] 1920 1021 +[pos] -1 -1 +*-4.000000 50 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +[treeopen] singleCycleCPUTestHarness. +[treeopen] singleCycleCPUTestHarness.cpu. +[sst_width] 229 +[signals_width] 190 +[sst_expanded] 1 +[sst_vpaned_height] 286 +@28 +singleCycleCPUTestHarness.cpu.clk +@22 +singleCycleCPUTestHarness.cpu.PC[31:0] +@4022 +^>1 /home/aehoppe/Documents/CompArch/Lab3/filters/../../mips-dasm/mips-dasm-filter +singleCycleCPUTestHarness.cpu.instruction[31:0] +@2022 +^1 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-opcodes.filter +singleCycleCPUTestHarness.cpu.instr_decode.opcode[5:0] +^2 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-funct.filter +singleCycleCPUTestHarness.cpu.instr_decode.funct[5:0] +^3 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-regs.filter +singleCycleCPUTestHarness.cpu.instr_decode.rs[4:0] +^3 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-regs.filter +singleCycleCPUTestHarness.cpu.instr_decode.rt[4:0] +^3 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-regs.filter +singleCycleCPUTestHarness.cpu.instr_decode.rd[4:0] +@22 +singleCycleCPUTestHarness.cpu.instr_decode.imm16[15:0] +singleCycleCPUTestHarness.cpu.instr_decode.target_address[25:0] +singleCycleCPUTestHarness.cpu.Da[31:0] +singleCycleCPUTestHarness.cpu.data_path.A[31:0] +singleCycleCPUTestHarness.cpu.Db[31:0] +@23 +singleCycleCPUTestHarness.cpu.data_path.B[31:0] +@22 +singleCycleCPUTestHarness.cpu.data_path.ALU_out[31:0] +@28 +singleCycleCPUTestHarness.cpu.data_path.ALU_src +[pattern_trace] 1 +[pattern_trace] 0 diff --git 
a/filters/singleCycleCPU-basic_testbench.gtkw b/filters/singleCycleCPU-basic_testbench.gtkw new file mode 100644 index 0000000..20c432f --- /dev/null +++ b/filters/singleCycleCPU-basic_testbench.gtkw @@ -0,0 +1,48 @@ +[*] +[*] GTKWave Analyzer v3.3.66 (w)1999-2015 BSI +[*] Fri Nov 17 06:32:08 2017 +[*] +[dumpfile] "/home/aehoppe/Documents/CompArch/Lab3/basic_testbench.vcd" +[dumpfile_mtime] "Fri Nov 17 06:29:27 2017" +[dumpfile_size] 98766 +[savefile] "/home/aehoppe/Documents/CompArch/Lab3/filters/singleCycleCPU-basic_testbench.gtkw" +[timestart] 0 +[size] 1920 1021 +[pos] -1 -1 +*-4.000000 10 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +[treeopen] singleCycleCPUTestHarness. +[treeopen] singleCycleCPUTestHarness.cpu. +[sst_width] 229 +[signals_width] 190 +[sst_expanded] 1 +[sst_vpaned_height] 286 +@28 +singleCycleCPUTestHarness.cpu.clk +@25 +singleCycleCPUTestHarness.cpu.PC[31:0] +@4022 +^>1 /home/aehoppe/Documents/CompArch/Lab3/filters/../../mips-dasm/mips-dasm-filter +singleCycleCPUTestHarness.cpu.instruction[31:0] +@2022 +^1 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-opcodes.filter +singleCycleCPUTestHarness.cpu.instr_decode.opcode[5:0] +^2 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-funct.filter +singleCycleCPUTestHarness.cpu.instr_decode.funct[5:0] +^3 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-regs.filter +singleCycleCPUTestHarness.cpu.instr_decode.rs[4:0] +^3 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-regs.filter +singleCycleCPUTestHarness.cpu.instr_decode.rt[4:0] +^3 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-regs.filter +singleCycleCPUTestHarness.cpu.instr_decode.rd[4:0] +@22 +singleCycleCPUTestHarness.cpu.instr_decode.imm16[15:0] +singleCycleCPUTestHarness.cpu.instr_decode.target_address[25:0] +singleCycleCPUTestHarness.cpu.Da[31:0] +singleCycleCPUTestHarness.cpu.data_path.A[31:0] +singleCycleCPUTestHarness.cpu.Db[31:0] +singleCycleCPUTestHarness.cpu.data_path.B[31:0] 
+singleCycleCPUTestHarness.cpu.data_path.ALU_out[31:0] +@28 +singleCycleCPUTestHarness.cpu.data_path.ALU_src +[pattern_trace] 1 +[pattern_trace] 0 diff --git a/filters/singleCycleCPU-quicksort.gtkw b/filters/singleCycleCPU-quicksort.gtkw new file mode 100644 index 0000000..bdb96d9 --- /dev/null +++ b/filters/singleCycleCPU-quicksort.gtkw @@ -0,0 +1,39 @@ +[*] +[*] GTKWave Analyzer v3.3.66 (w)1999-2015 BSI +[*] Fri Nov 17 00:39:09 2017 +[*] +[dumpfile] "/home/aehoppe/Documents/CompArch/Lab3/quicksort.vcd" +[dumpfile_mtime] "Fri Nov 17 00:26:36 2017" +[dumpfile_size] 33166 +[savefile] "/home/aehoppe/Documents/CompArch/Lab3/filters/singleCycleCPU.gtkw" +[timestart] 0 +[size] 1920 1021 +[pos] -33 -33 +*-5.644512 10 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +[treeopen] singleCycleCPUTestHarness. +[treeopen] singleCycleCPUTestHarness.cpu. +[sst_width] 229 +[signals_width] 190 +[sst_expanded] 1 +[sst_vpaned_height] 286 +@22 +singleCycleCPUTestHarness.cpu.PC[31:0] +@4023 +^>1 /home/aehoppe/Documents/CompArch/mips-dasm/mips-dasm-filter +singleCycleCPUTestHarness.cpu.instruction[31:0] +@2022 +^1 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-opcodes.filter +singleCycleCPUTestHarness.cpu.instr_decode.opcode[5:0] +^2 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-funct.filter +singleCycleCPUTestHarness.cpu.instr_decode.funct[5:0] +^3 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-regs.filter +singleCycleCPUTestHarness.cpu.instr_decode.rs[4:0] +^3 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-regs.filter +singleCycleCPUTestHarness.cpu.instr_decode.rt[4:0] +^3 /home/aehoppe/Documents/CompArch/Lab3/filters/mips-regs.filter +singleCycleCPUTestHarness.cpu.instr_decode.rd[4:0] +@22 +singleCycleCPUTestHarness.cpu.instr_decode.target_address[25:0] +singleCycleCPUTestHarness.cpu.instr_decode.imm16[15:0] +[pattern_trace] 1 +[pattern_trace] 0 diff --git a/img/clock-cycle.jpg b/img/clock-cycle.jpg new file mode 100644 index 
0000000..491e2d3 Binary files /dev/null and b/img/clock-cycle.jpg differ diff --git a/img/out-of-order.png b/img/out-of-order.png new file mode 100644 index 0000000..fb01e50 Binary files /dev/null and b/img/out-of-order.png differ diff --git a/img/single-cycle-cpu_DataPath.jpg b/img/single-cycle-cpu_DataPath.jpg new file mode 100644 index 0000000..71e6367 Binary files /dev/null and b/img/single-cycle-cpu_DataPath.jpg differ diff --git a/img/single-cycle-cpu_IF.jpg b/img/single-cycle-cpu_IF.jpg new file mode 100644 index 0000000..a8f1bd7 Binary files /dev/null and b/img/single-cycle-cpu_IF.jpg differ diff --git a/img/single-cycle-cpu_decode.jpg b/img/single-cycle-cpu_decode.jpg new file mode 100644 index 0000000..ff45829 Binary files /dev/null and b/img/single-cycle-cpu_decode.jpg differ diff --git a/img/single-cycle-report_block-diagram.jpg b/img/single-cycle-report_block-diagram.jpg new file mode 100644 index 0000000..bdd0441 Binary files /dev/null and b/img/single-cycle-report_block-diagram.jpg differ diff --git a/img/test-success.png b/img/test-success.png new file mode 100644 index 0000000..93aad10 Binary files /dev/null and b/img/test-success.png differ diff --git a/settings.mk b/settings.mk new file mode 100644 index 0000000..1635613 --- /dev/null +++ b/settings.mk @@ -0,0 +1,18 @@ +# Project-specific settings + +## Assembly settings + +# Assembly program (minus .asm extension) +PROGRAM := basic_testbench + +# Memory image(s) to create from the assembly program +MEMDUMP := $(PROGRAM).text.hex + + +## Verilog settings + +# Top-level module/filename (minus .v/.t.v extension) +TOPLEVEL := singleCycleCPU + +# All circuits included by the toplevel $(TOPLEVEL).t.v +CIRCUITS := $(TOPLEVEL).v diff --git a/sim_example/Makefile b/sim_example/Makefile new file mode 100644 index 0000000..839197d --- /dev/null +++ b/sim_example/Makefile @@ -0,0 +1,31 @@ +# Assembly simulation in Verilog unified Makefile example + +include settings.mk + +GTKWAVE := gtkwave +SIM := vvp + +# 
Final waveform to produce is the combination of machine and program +WAVEFORM := $(TOPLEVEL)-$(PROGRAM).vcd +WAVEOPTS := filters/$(WAVEFORM:vcd=gtkw) + + +# Build memory image, compile Verilog, run simulation to produce VCD trace +$(WAVEFORM): settings.mk + $(MAKE) -C asm $(MEMDUMP) + $(MAKE) -C verilog $(TOPLEVEL).vvp + $(SIM) verilog/$(TOPLEVEL).vvp +mem_fn=asm/$(MEMDUMP) +dump_fn=$@ + + +# Open waveform with saved formatting and filter options +scope: $(WAVEFORM) $(WAVEOPTS) + $(GTKWAVE) $(WAVEOPTS) + + +# Remove generated files, including from subdirectories +clean: + $(MAKE) -C asm clean + $(MAKE) -C verilog clean + rm -f $(WAVEFORM) + +.PHONY: scope clean diff --git a/sim_example/README.md b/sim_example/README.md new file mode 100644 index 0000000..de29209 --- /dev/null +++ b/sim_example/README.md @@ -0,0 +1,19 @@ +# Assembly and Verilog simulation example + +This code demonstrates several concepts that could be helpful for testing your CPU: + +* cpu: Fake CPU Verilog - doesn't do much, but shows instructions loaded into memory and flowing through a pipeline +* asm: Turning assembly code into a memory image on the command line +* filters: GTKWave filter files that may be useful for various MIPS instruction fields, and a saved GTKWave session showing them all in use +* Various Makefiles demonstrating how these tasks can be automated + +At the root, run ```make``` to + +1. assemble the example program +1. compile the Verilog cpu into a vvp simulator +1. run the simulation, loading the assembly program into CPU memory + +You can run ```make scope``` to load the waveform in GTKWave with some nice filters added. + +```make clean``` removes all generated files (which you should not be committing in your own repos). 
+ diff --git a/sim_example/asm/Makefile b/sim_example/asm/Makefile new file mode 100644 index 0000000..ab8c462 --- /dev/null +++ b/sim_example/asm/Makefile @@ -0,0 +1,27 @@ +# Generate machine code memory image from MIPS assembly + +# Get PROGRAM and MEMDUMP from project settings +include ../settings.mk + +MARS_PATH := ~/Documents/CompArch/mips/Mars4_5.jar +MARS_OPTS := a mc CompactTextAtZero +MARS := java -jar $(MARS_PATH) $(MARS_OPTS) + + +# Pattern rule for generating .text memory dump from MIPS assembly +%.text.hex: %.asm + $(MARS) dump .text HexText $@ $< + +# Pattern rule for generating .data memory dump from MIPS assembly +%.data.hex: %.asm + $(MARS) dump .data HexText $@ $< + + +# Shortcut (phony) targets for convenience +assemble: $(MEMDUMP) + +clean: + -rm -f $(MEMDUMP) + + +.PHONY: assemble clean diff --git a/sim_example/asm/fib_func.asm b/sim_example/asm/fib_func.asm new file mode 100644 index 0000000..d2d5279 --- /dev/null +++ b/sim_example/asm/fib_func.asm @@ -0,0 +1,132 @@ +# Function call example: recursive Fibonacci + +main: +# Set up arguments for call to fib_test +addi $a0, $zero, 4 # arg0 = 4 +addi $a1, $zero, 10 # arg1 = 10 +jal fib_test + +# Print result +add $a0, $zero, $v0 # Copy result into argument register a0 +jal print_result + +# Jump to "exit", rather than falling through to subroutines +j program_end + +#------------------------------------------------------------------------------ +# Fibonacci test function. Equivalent C code: +# int fib_test(arg0, arg1) { +# return Fibonacci(arg0) + Fibonacci(arg1); +# } +# By MIPS calling convention, expects arguments in +# registers a0 and a1, and returns result in register v0. +fib_test: +# We will use s0 and s1 registers in this function, plus the ra register +# to return at the end. Save them to stack in case caller was using them. 
+addi $sp, $sp, -12 # Allocate three words on stack at once for three pushes +sw $ra, 8($sp) # Push ra on the stack (will be overwritten by Fib function calls) +sw $s0, 4($sp) # Push s0 onto stack +sw $s1, 0($sp) # Push s1 onto stack + +# a1 may be overwritten by called functions, so save it to s1 (saved temporary), +# which called function won't change, so we can use it later for the second fib call +add $s1, $zero, $a1 + +# Call Fib(arg0), save result in s0 +# arg0 is already in register a0, placed there by caller of fib_test +jal fib # Call fib(4), returns in register v0 +add $s0, $zero, $v0 # Move result to s0 so we can call fib again without overwriting + +# Call Fib(arg1), save result in s1 +add $a0, $zero, $s1 # Move original arg1 into register a0 for function call +jal fib +add $s1, $zero, $v0 # Move result to s1 + +# Add Fib(arg0) and Fib(arg1) into v0 (return value for fib_test) +add $v0, $s0, $s1 + +# Restore original values to s0 and s1 registers from stack before returning +lw $s1, 0($sp) # Pop s1 from stack +lw $s0, 4($sp) # Pop s0 from stack +lw $ra, 8($sp) # Pop ra from the stack so we can return to caller +addi $sp, $sp, 12 # Adjust stack pointer to reflect pops + +jr $ra # Return to caller + +#------------------------------------------------------------------------------ +# Recursive Fibonacci function. Equivalent C code: +# +# int Fibonacci(int n) { +# if (n == 0) return 0; // Base case +# if (n == 1) return 1; // Base case +# int fib_1 = Fibonacci(n - 1); +# int fib_2 = Fibonacci(n - 2); +# return fib_1+fib_2; +# } +fib: +# Test base cases. 
If we're in a base case, return directly (no need to use stack) +bne $a0, 0, testone +add $v0, $zero, $zero # a0 == 0 -> return 0 +jr $ra +testone: +bne $a0, 1, fib_body +add $v0, $zero, $a0 # a0 == 1 -> return 1 +jr $ra + +fib_body: +# Create stack frame for fib: push ra and s0 +addi $sp, $sp, -8 # Allocate two words on stack at once for two pushes +sw $ra, 4($sp) # Push ra on the stack (will be overwritten by recursive function calls) +sw $s0, 0($sp) # Push s0 onto stack + +# Call Fib(n-1), save result in s0 +add $s0, $zero, $a0 # Save a0 argument (n) in register s0 +addi $a0, $a0, -1 # a0 = n-1 +jal fib +add $a0, $s0, -2 # a0 = n-2 +add $s0, $zero, $v0 # s0 = Fib(n-1) + +# Call Fib(n-2), compute final result +jal fib +add $v0, $v0, $s0 # v0 = Fib(n-2) + Fib(n-1) + +# Restore registers and pop stack frame +lw $ra, 4($sp) +lw $s0, 0($sp) +addi $sp, $sp, 8 + +jr $ra # Return to caller + +#------------------------------------------------------------------------------ +# Utility function to print results +print_result: +# Create stack frame for ra and s0 +addi $sp, $sp, -8 +sw $ra, 4($sp) +sw $s0, 0($sp) + +add $s0, $zero, $a0 # Save argument (integer to print) to s0 + +li $v0, 4 # Service code to print string +la $a0, result_str # Argument is memory address of string to print +syscall + +li $v0, 1 # Service code to print integer +add $a0, $zero, $s0 # Argument is integer to print +syscall + +# Restore registers and pop stack frame +lw $ra, 4($sp) +lw $s0, 0($sp) +addi $sp, $sp, 8 + +#------------------------------------------------------------------------------ +# Jump loop to end execution, so we don't fall through to .data section +program_end: +j program_end + + +#------------------------------------------------------------------------------ +.data +# Null-terminated string to print as part of result +result_str: .asciiz "\nFib(4)+Fib(10) = " diff --git a/sim_example/asm/quicksort.asm b/sim_example/asm/quicksort.asm new file mode 100644 index 0000000..501470a --- 
/dev/null +++ b/sim_example/asm/quicksort.asm @@ -0,0 +1,231 @@ +main: +addi $sp, $zero, 0x00003ffc +la $s0, array +addi $a0, $zero, 0 +addi $a1, $zero, 9 +jal quicksort +j done + + +quicksort: +# $s0 = arr* +# $a0 = start +# $a1 = end +# $t0 = pivot +# $t1 = branch check temporary (for xori and slt) + + +# if start < end, run quicksort +slt $t1, $a0, $a1 +bne $t1, $zero, run +j end + +run: +# push frame onto stack +addi $sp, $sp, -12 +sw $ra, 8($sp) +sw $a0, 4($sp) +sw $a1, ($sp) + +# pivot = partition (arr, start, end) +jal partition + +# pop frame from stack +lw $ra, 8($sp) +lw $a0, 4($sp) +lw $a1, ($sp) +addi $sp, $sp, 12 + +add $t0, $zero, $v0 + +# push frame onto stack +addi $sp, $sp, -16 +sw $ra, 12($sp) +sw $a0, 8($sp) +sw $a1, 4($sp) +sw $t0, ($sp) + +# quicksort(arr, start, pivot - 1) +addi $a1, $t0, -1 + +jal quicksort + +# pop frame from stack +lw $ra, 12($sp) +lw $a0, 8($sp) +lw $a1, 4($sp) +lw $t0, ($sp) +addi $sp, $sp, 16 + +# push frame onto stack +addi $sp, $sp, -12 +sw $ra, 8($sp) +sw $a0, 4($sp) +sw $a1, ($sp) + +# quicksort(arr, pivot + 1, end) +add $a0, $t0, 1 + +jal quicksort + +# pop frame from stack +lw $ra, 8($sp) +lw $a0, 4($sp) +lw $a1, ($sp) +addi $sp, $sp, 12 + +end: +jr $ra + + +partition: +# $v0 = return val +# $a0 = start +# $a1 = end +# $a2 = arr index (calcMemAddr) +# $s0 = arr* +# $s1 = pivot +# $s2 = i (counter) +# $s3 = j (counter) +# $s4 = arr[i] val +# $s5 = arr[j] val +# $t0 = branch check temporary (for xori and slt) +# $t3 = arr[end] addr +# $t4 = arr[i] addr +# $t5 = arr[j] addr + +addi $sp, $sp, -4 +sw $ra, ($sp) +# ----------------------------------------------------------------- +## int pivot = arr[end] +## int i = start - 1; + +# set arr index to end and call calcMemAddr +add $a2, $zero, $a1 +jal calcMemAddr +# set arr[end] addr +add $t3, $zero, $v0 + +# set reg pivot to mem[arr[end]] +lw $s1, ($t3) + +# set i to start - 1 +sub $s2, $a0, 1 + +# ----------------------------------------------------------------- +## for 
(int j = start; j < end; j++) { +## if (arr[j] <= pivot) { +## i++; +## int temp = arr[i]; +## arr[i] = arr[j]; +## arr[j] = temp; +## } +## } + +# set j to start and jump to check +add $s3, $zero, $a0 +j forcheck +forloop: + +# set arr index to j and call calcMemAddr +add $a2, $zero, $s3 +jal calcMemAddr +# set arr[j] addr +add $t5, $zero, $v0 + +# set reg arr[j] to mem[arr[j]] +lw $s5, ($t5) + +# check if arr[j] <= pivot +addi $s1, $s1, 1 +slt $t0, $s5, $s1 +addi $s1, $s1, -1 + +# execute swap if slt is true +bne $t0, $zero, swap +j increment +swap: +# i++ +addi $s2, $s2, 1 + +# set arr index to i and call calcMemAddr +add $a2, $zero, $s2 +jal calcMemAddr +# set arr[i] addr +add $t4, $zero, $v0 + +# set reg arr[i] to mem[arr[i]] +lw $s4, ($t4) + +# store in opposite places +sw $s4, ($t5) +sw $s5, ($t4) + +# increment j +increment: +addi $s3, $s3, 1 +# break for loop when j = end +forcheck: +bne $s3, $a1, forloop + +# ----------------------------------------------------------------- +## int temp = arr[i + 1]; +## arr[i + 1] = arr[end]; +## arr[end] = temp; +## return i + 1; + +# set i to i + 1, arr index to i + 1 and call calcMemAddr +addi $s2, $s2, 1 +add $a2, $zero, $s2 +jal calcMemAddr + +# set reg arr[i] to mem[arr[i + 1]] +lw $s4, ($v0) +# store pivot at mem[arr[i + 1]] +sw $s1, ($v0) + +# store reg arr[i] (holding arr[i + 1]) into mem[arr[end]] +sw $s4, ($t3) + +#return i + 1 +add $v0, $zero, $s2 +lw $ra, ($sp) +addi $sp, $sp, 4 +jr $ra + + +calcMemAddr: +# $v0 = addr (return) +# $t0 = multiply counter temporary +# $t1 = branch check temporary (for xori and slt) + +# set addr to arr* and mult counter to 0 +add $v0, $zero, $s0 +addi $t0, $zero, 0 +calc: +# add index to addr, 1 to mult counter +add $v0, $v0, $a2 +addi $t0, $t0, 1 +# if mult counter != 4, loop +xori $t1, $t0, 4 +bne $t1, $zero, calc +jr $ra + + +done: +j done +# addi $v0, $zero, 10 +# syscall + +.data +array: +0x00000009 +0x00000005 +0x00000003 +0x00000006 +0x00000002 +0x00000008 +0x00000007 
+0x00000003 +0x00000001 +0x00000004 diff --git a/sim_example/filters/fake_cpu-fib_func.gtkw b/sim_example/filters/fake_cpu-fib_func.gtkw new file mode 100644 index 0000000..b39c5ac --- /dev/null +++ b/sim_example/filters/fake_cpu-fib_func.gtkw @@ -0,0 +1,52 @@ +[*] +[*] GTKWave Analyzer v3.3.79 (w)1999-2017 BSI +[*] Wed Nov 15 00:28:31 2017 +[*] +[dumpfile] "fake_cpu-fib_func.vcd" +[dumpfile_mtime] "Wed Nov 15 00:24:17 2017" +[dumpfile_size] 7517 +[savefile] "filters/fake_cpu-fib_func.gtkw" +[timestart] 0 +[size] 1000 600 +[pos] -1 -1 +*-5.270822 57 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +[treeopen] cpu_test. +[treeopen] cpu_test.cpu. +[sst_width] 193 +[signals_width] 133 +[sst_expanded] 1 +[sst_vpaned_height] 168 +@28 +cpu_test.clk +cpu_test.reset +@200 +---A-- +@22 +cpu_test.cpu.PC_A[31:0] +cpu_test.cpu.INS_A[31:0] +@200 +---B-- +@22 +cpu_test.cpu.PC_B[31:0] +cpu_test.cpu.INS_B[31:0] +@2022 +^2 filters/mips-opcodes.filter +cpu_test.cpu.OP_B[5:0] +@200 +---C-- +@22 +cpu_test.cpu.PC_C[31:0] +cpu_test.cpu.INS_C[31:0] +@2022 +^4 filters/mips-funct.filter +cpu_test.cpu.FUNCT_C[5:0] +@2023 +^2 filters/mips-opcodes.filter +cpu_test.cpu.OP_C[5:0] +@2022 +^1 filters/mips-regs.filter +cpu_test.cpu.RS_C[4:0] +^1 filters/mips-regs.filter +cpu_test.cpu.RT_C[4:0] +[pattern_trace] 1 +[pattern_trace] 0 diff --git a/sim_example/filters/mips-funct.filter b/sim_example/filters/mips-funct.filter new file mode 100644 index 0000000..654e706 --- /dev/null +++ b/sim_example/filters/mips-funct.filter @@ -0,0 +1,21 @@ +# MIPS funct codes (R-type, OP=0x00) +00 sll +02 srl +03 sra +08 jr +10 mfhi +12 mflo +18 mult +19 multu +1A div +1B divu +20 add +21 addu +22 sub +23 subu +24 and +25 or +26 xor +27 nor +2A slt +2B sltu diff --git a/sim_example/filters/mips-opcodes.filter b/sim_example/filters/mips-opcodes.filter new file mode 100644 index 0000000..fedc8cc --- /dev/null +++ b/sim_example/filters/mips-opcodes.filter @@ -0,0 +1,20 @@ +# MIPS opcodes +00 
R-type +02 j +03 jal +04 beq +05 bne +08 addi +09 addiu +0A slti +0B sltiu +0C andi +0D ori +0F lui +10 mfc0 +23 lw +24 lbu +25 lhu +28 sb +29 sh +2B sw diff --git a/sim_example/filters/mips-regs.filter b/sim_example/filters/mips-regs.filter new file mode 100644 index 0000000..f704bdb --- /dev/null +++ b/sim_example/filters/mips-regs.filter @@ -0,0 +1,33 @@ +# MIPS register names +00 $zero +01 $at +02 $v0 +03 $v1 +04 $a0 +05 $a1 +06 $a2 +07 $a3 +08 $t0 +09 $t1 +0A $t2 +0B $t3 +0C $t4 +0D $t5 +0E $t6 +0F $t7 +10 $s0 +11 $s1 +12 $s2 +13 $s3 +14 $s4 +15 $s5 +16 $s6 +17 $s7 +18 $t8 +19 $t9 +1a $k0 +1b $k1 +1c $gp +1d $sp +1e $fp +1f $ra diff --git a/sim_example/settings.mk b/sim_example/settings.mk new file mode 100644 index 0000000..3dd73db --- /dev/null +++ b/sim_example/settings.mk @@ -0,0 +1,18 @@ +# Project-specific settings + +## Assembly settings + +# Assembly program (minus .asm extension) +PROGRAM := quicksort + +# Memory image(s) to create from the assembly program +MEMDUMP := $(PROGRAM).text.hex + + +## Verilog settings + +# Top-level module/filename (minus .v/.t.v extension) +TOPLEVEL := fake_cpu + +# All circuits included by the toplevel $(TOPLEVEL).t.v +CIRCUITS := $(TOPLEVEL).v counter.v diff --git a/sim_example/verilog/Makefile b/sim_example/verilog/Makefile new file mode 100644 index 0000000..43374fb --- /dev/null +++ b/sim_example/verilog/Makefile @@ -0,0 +1,23 @@ +# Verilog compilation Makefile example + +# Get TOPLEVEL and CIRCUITS variables from project settings +include ../settings.mk + +ICARUS_OPTS := -Wall +IVERILOG := iverilog $(ICARUS_OPTS) +LINT := verilator --lint-only +SIM := vvp + + +# Pattern rule for compiling vvp (Icarus assembly) from a testbench +%.vvp: %.t.v $(CIRCUITS) + $(IVERILOG) -o $@ $< + + +# Shortcut (phony) targets for convenience +compile: $(TOPLEVEL).vvp + +clean: + -rm -f $(TOPLEVEL).vvp + +.PHONY: compile clean diff --git a/sim_example/verilog/counter.v b/sim_example/verilog/counter.v new file mode 100644 index 
0000000..546199b --- /dev/null +++ b/sim_example/verilog/counter.v @@ -0,0 +1,22 @@ +//------------------------------------------------------------------------ +// Simple resettable up-counter +//------------------------------------------------------------------------ + +module counter +#( + parameter width = 16, + parameter increment = 1, + parameter init_val = 0 +) +( + output reg [width-1:0] count, + input clk, + input reset +); + + always @(posedge clk, posedge reset) begin + if (reset) count <= init_val; + else count <= count + increment; + end + +endmodule diff --git a/sim_example/verilog/fake_cpu.t.v b/sim_example/verilog/fake_cpu.t.v new file mode 100644 index 0000000..5d8d31c --- /dev/null +++ b/sim_example/verilog/fake_cpu.t.v @@ -0,0 +1,72 @@ +`include "fake_cpu.v" + +//------------------------------------------------------------------------ +// Simple fake CPU testbench sequence +//------------------------------------------------------------------------ + +module cpu_test (); + + reg clk; + reg reset; + + // Clock generation + initial clk=0; + always #10 clk = !clk; + + // Instantiate fake CPU + fake_cpu cpu(.clk(clk), .reset(reset)); + + + reg [1023:0] mem_fn; + reg [1023:0] dump_fn; + + // Test sequence + initial begin + + // Get command line arguments for memory image and VCD dump file + // http://iverilog.wikia.com/wiki/Simulation + // http://www.project-veripage.com/plusarg.php + if (! $value$plusargs("mem_fn=%s", mem_fn)) begin + $display("ERROR: provide +mem_fn=[path to memory image] argument"); + $finish(); + end + if (! $value$plusargs("dump_fn=%s", dump_fn)) begin + $display("ERROR: provide +dump_fn=[path for VCD dump] argument"); + $finish(); + end + + + // Load CPU memory from (assembly) dump file + $readmemh(mem_fn, cpu.memory); + // Alternate: Explicitly state which array element range to read into + //$readmemh("mymem.hex", memory, 10, 80); + + // Dump waveforms to file + // Note: arrays (e.g. 
memory) are not dumped by default + $dumpfile(dump_fn); + $dumpvars(); + + // Assert reset pulse + reset = 0; #10; + reset = 1; #10; + reset = 0; #10; + + // Display a few cycles just for quick checking + // Note: I'm just dumping instruction bits, but you can do some + // self-checking test cases based on your CPU and program and + // automatically report the results. + $display("Time | PC | Instruction"); + repeat(3) begin + $display("%4t | %h | %h", $time, cpu.PC_A, cpu.INS_A); #20 ; + end + $display("... more execution (see waveform)"); + + // End execution after some time delay - adjust to match your program + // or use a smarter approach like looking for an exit syscall or the + // PC to be the value of the last instruction in your program. + #2000 $finish(); + end + +endmodule + + diff --git a/sim_example/verilog/fake_cpu.v b/sim_example/verilog/fake_cpu.v new file mode 100644 index 0000000..5472f4b --- /dev/null +++ b/sim_example/verilog/fake_cpu.v @@ -0,0 +1,84 @@ +`include "counter.v" + +//------------------------------------------------------------------------ +// Fake CPU with three "pipeline stages" A -> B -> C +//------------------------------------------------------------------------ + +module fake_cpu +( + input clk, + input reset +); + + //-------------------------------------------------------------------- + // Stage A - "Instruction Fetch" + + wire [31:0] PC_A; + wire [31:0] INS_A; + + // Simplified PC generation unit - increments by 4 every cycle + counter #(.width(32), .increment(4)) pc_incr(.count(PC_A), + .clk(clk), + .reset(reset)); + + + // 16KiB memory, organized as 4096 element array of 32-bit words + reg [31:0] memory [4095:0]; + // Alternate: 16KiB memory, organized as 16384 element array of bytes + // reg [7:0] memory [2**14-1:0]; + + + // Simplified memory "read port" + assign INS_A = memory[ PC_A[13:2] ]; + // Note: Discards the low 2 bits of the PC (should be zero) since I've + // implemented my memory as an array of words instead 
of bytes. Discards + // upper 18 bits of PC (should be zero) because my memory is only 16 KiB + // (smaller than maximum addressible 2^32 bytes). + + // Non-synthesizable debugging code for checking assertions about PC + always @(PC_A) begin + if (| PC_A[1:0]) begin // Lower PC bits != 00 + $display("Warning: misaligned PC access, truncating: %h", PC_A); + end + if (| PC_A[31:14]) begin // Upper PC bits non-zero + $display("Error: PC outside implemented memory range: %h", PC_A); + $stop(); + end + end + + //-------------------------------------------------------------------- + // Stages B and C - fake functionality to see more signals propagate + + reg [31:0] PC_B, PC_C; + reg [31:0] INS_B, INS_C; + + // Op-code is the upper 6 bits, for all instruction formats + wire [5:0] OP_B; + reg [5:0] OP_C; + assign OP_B = INS_B[31:26]; + + // Funct code is the lowest 6 bits for R type (not meaningful for others) + wire [5:0] FUNCT_C; + assign FUNCT_C = INS_C[5:0]; + + // Register addresses (not meaningful for J-type instructions) + wire [4:0] RS_C, RT_C; + assign RS_C = INS_C[25:21]; + assign RT_C = INS_C[20:16]; + + + //-------------------------------------------------------------------- + // Registers between pipeline stages + + always @(posedge clk) begin + // A-B registers + PC_B <= PC_A; + INS_B <= INS_A; + + // B-C registers + PC_C <= PC_B; + INS_C <= INS_B; + OP_C <= OP_B; + end + +endmodule diff --git a/test.sh b/test.sh new file mode 100755 index 0000000..f66b55b --- /dev/null +++ b/test.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +make clean +make +gtkwave filters/singleCycleCPU-basic_testbench.gtkw diff --git a/verilog/Makefile b/verilog/Makefile new file mode 100644 index 0000000..43374fb --- /dev/null +++ b/verilog/Makefile @@ -0,0 +1,23 @@ +# Verilog compilation Makefile example + +# Get TOPLEVEL and CIRCUITS variables from project settings +include ../settings.mk + +ICARUS_OPTS := -Wall +IVERILOG := iverilog $(ICARUS_OPTS) +LINT := verilator --lint-only +SIM := vvp + + 
+# Pattern rule for compiling vvp (Icarus assembly) from a testbench +%.vvp: %.t.v $(CIRCUITS) + $(IVERILOG) -o $@ $< + + +# Shortcut (phony) targets for convenience +compile: $(TOPLEVEL).vvp + +clean: + -rm -f $(TOPLEVEL).vvp + +.PHONY: compile clean diff --git a/verilog/Memories/addingtest.dat b/verilog/Memories/addingtest.dat new file mode 100644 index 0000000..278c9e5 --- /dev/null +++ b/verilog/Memories/addingtest.dat @@ -0,0 +1,9 @@ +2008000a +20090009 +200a000d +200b0010 +200c000c +01096820 +01aa6820 +01ab6820 +01ac6820 diff --git a/verilog/Memories/mips1.asm b/verilog/Memories/mips1.asm new file mode 100644 index 0000000..84d55d3 --- /dev/null +++ b/verilog/Memories/mips1.asm @@ -0,0 +1,11 @@ + +addi $t0, $zero, 10 +addi $t1, $zero, 9 +addi $t2, $zero, 13 +addi $t3, $zero, 16 +addi $t4, $zero, 12 + +add $t5, $t0, $t1 +add $t5, $t5, $t2 +add $t5, $t5, $t3 +add $t5, $t5, $t4 diff --git a/verilog/alu.t.v b/verilog/alu.t.v new file mode 100644 index 0000000..50b903e --- /dev/null +++ b/verilog/alu.t.v @@ -0,0 +1,287 @@ +//Test harness for testing 32 bit ALU +`define code_ADD 3'b000 +`define code_SUB 3'b001 +`define code_XOR 3'b010 +`define code_SLT 3'b011 +`define code_AND 3'b100 +`define code_NAND 3'b101 +`define code_NOR 3'b110 +`define code_OR 3'b111 + +`include "alu.v" + +module ALUTestHarness (); + // Declare registers for inputs + reg signed [31:0] A, B; + reg[2:0] command; + + // Declare output wires + wire cout, ovf, zero; + wire[31:0] out; + + // Instantiate DUT + ALU alu (out, cout, ovf, zero, A, B, command); + + // Declare helper variable registers + + // Set of operands to loop through for ADD, SUB and SLT + reg[191:0] a_vals = { + 32'd400000000, + 32'd1500000000, + -32'd300000000, + -32'd1000000000, + -32'd2147483647, + 32'd5000 + }; + reg[191:0] b_vals = { + 32'd500000000, + 32'd1000000000, + -32'd100000000, + -32'd2000000000, + 32'd2147483647, + 32'd5000 + }; + + // Expected output flags + // cout | ovf | zero + reg[17:0] add_res = { + 3'b000, + 
3'b010, + 3'b100, + 3'b110, + 3'b101, + 3'b000 + }; + + // Expected output flags + // cout | ovf | zero + reg[17:0] sub_res = { + 3'b000, + 3'b100, + 3'b000, + 3'b100, + 3'b110, + 3'b101 + }; + + reg[2:0] logic_index; + reg ex_cout, ex_ovf, ex_zero; + reg[3:0] add_index; + reg[15:0] testfailed; + + initial begin + $dumpfile("alu.vcd"); + $dumpvars(0, alu); + testfailed = 0; + + $display("Test Commence"); + + // Test Worst Case Delay + // Setup by doing SLT on -2147483648 1 + A = -2147483648; B = 1; command = `code_SLT; #2000 + + if ( out != 32'b1 ) begin + testfailed = testfailed +1; + $display("Test SLT A:%b B:%b Failed, Expected Out:%b, Got Out:%b", A, B, 32'b1, out); + end + + // Next measure delay for SLT on 0 0 + A = 0; B = 0; #2000 + if ( out != 32'b0 ) begin + testfailed = testfailed +1; + $display("Test SLT A:%b B:%b Failed, Expected Out:%b, Got Out:%b", A, B, 32'b0, out); + end + + + // Test block logic to check all of the bits + command = `code_XOR; + for (logic_index = 0; logic_index < 4; logic_index = logic_index + 1) begin + A = (logic_index[0]==0) ? 32'd0 : 32'd2147483647; + B = (logic_index[1]==0) ? 32'd0 : 32'd2147483647;#2000 + if (out != (A^B)) begin + testfailed = testfailed +1; + $display("Test XOR A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, A^B, out); + end + if (cout == 1 || ovf == 1 || zero == 1) begin + testfailed = testfailed +1; + $display("Test XOR A:%b B:%b Failed, Produced Flags cout:%b, ovf:%b, zero:%b", A, B, cout, ovf, zero); + end + end + + command = `code_AND; + for (logic_index = 0; logic_index < 4; logic_index = logic_index + 1) begin + A = (logic_index[0]==0) ? 32'd0 : -32'd1; + B = (logic_index[1]==0) ? 
32'd0 : -32'd1;#2000 + if (out != (A&B)) begin + testfailed = testfailed +1; + $display("Test AND A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, A&B, out); + end + if (cout == 1 || ovf == 1 || zero == 1) begin + testfailed = testfailed +1; + $display("Test AND A:%b B:%b Failed, Produced Flags cout:%b, ovf:%b, zero:%b", A, B, cout, ovf, zero); + end + end + + command = `code_NAND; + for (logic_index = 0; logic_index < 4; logic_index = logic_index + 1) begin + A = (logic_index[0]==0) ? 32'd0 : -32'd1; + B = (logic_index[1]==0) ? 32'd0 : -32'd1;#2000 + if (out != (A~&B)) begin + testfailed = testfailed +1; + $display("Test NAND A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, A~&B, out); + end + if (cout == 1 || ovf == 1 || zero == 1) begin + testfailed = testfailed +1; + $display("Test NAND A:%b B:%b Failed, Produced Flags cout:%b, ovf:%b, zero:%b", A, B, cout, ovf, zero); + end + end + + command = `code_NOR; + for (logic_index = 0; logic_index < 4; logic_index = logic_index + 1) begin + A = (logic_index[0]==0) ? 32'd0 : -32'd1; + B = (logic_index[1]==0) ? 32'd0 : -32'd1;#2000 + if (out != (A~|B)) begin + testfailed = testfailed +1; + $display("Test NOR A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, A~|B, out); + end + if (cout == 1 || ovf == 1 || zero == 1) begin + testfailed = testfailed +1; + $display("Test NOR A:%b B:%b Failed, Produced Flags cout:%b, ovf:%b, zero:%b", A, B, cout, ovf, zero); + end + end + + command = `code_OR; + for (logic_index = 0; logic_index < 4; logic_index = logic_index + 1) begin + A = (logic_index[0]==0) ? 32'd0 : -32'd1; + B = (logic_index[1]==0) ? 
32'd0 : -32'd1;#2000 + if (out != (A|B)) begin + testfailed = testfailed +1; + $display("Test OR A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, A|B, out); + end + if (cout == 1 || ovf == 1 || zero == 1) begin + testfailed = testfailed +1; + $display("Test OR A:%b B:%b Failed, Produced Flags cout:%b, ovf:%b, zero:%b", A, B, cout, ovf, zero); + end + end + + + // Test all signals in ADD by using all bits for a non-zero result + command = `code_ADD; + // 0111... + 1111... + A = 2147483647; B = -1;#2000 + if (out != 2147483646) begin + testfailed = testfailed +1; + $display("Test ADD A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, 2147483646, out); + end + + // 1111... + 1000... + A = -1; B = 32'b1<<31;#2000 + if (out != 2147483647) begin + testfailed = testfailed +1; + $display("Test ADD A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, 2147483647, out); + end + + // Test a few internal carries for ADD + // ...0001 + ...0001 + A = 32'b1; B = 32'b1;#2000 + if (out != 32'b10) begin + testfailed = testfailed +1; + $display("Test ADD A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, 32'b10, out); + end + + // ...0010 + 0010 + A = 32'b10; B =32'b10;#2000 + if (out != 32'b100) begin + testfailed = testfailed +1; + $display("Test ADD A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, 32'b100, out); + end + + // 0100... + 0100... + A = 32'b1<<30; B = 32'b1<<30 ;#2000 + if (out != 32'b1<<31) begin + testfailed = testfailed +1; + $display("Test ADD A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, 32'b1<<31, out); + end + + // 1000... + 1000... 
+ A = 32'b1<<31; B = 32'b1<<31 ;#2000 + if (out != 32'b0) begin + testfailed = testfailed +1; + $display("Test ADD A:%b B:%b Failed, Expected Out:%b Got Out:%b", A, B, 32'b0, out); + end + + + // ADD SUB SLT interesting cases + command = `code_ADD; + for (add_index = 0; add_index<6; add_index = add_index + 1) begin + A = a_vals[(add_index*32)+:32]; // Grab 32-bit chunk number add_index (0 = least-significant chunk) of the queued operands + B = b_vals[(add_index*32)+:32];#2000 + {ex_cout,ex_ovf,ex_zero} = add_res[(add_index*3)+:3]; + + if (out != (A+B)) begin + testfailed = testfailed +1; + $display("Test ADD A:%d B:%d Failed, Expected Out:%d Got Out:%d", A, B, A+B, out); + end + if (cout != ex_cout) begin + testfailed = testfailed +1; + $display("Test ADD A:%d B:%d Failed, Expected cout:%d Got cout:%d", A, B, ex_cout, cout); + end + if (ovf != ex_ovf) begin + testfailed = testfailed +1; + $display("Test ADD A:%d B:%d Failed, Expected ovf:%d Got ovf:%d", A, B, ex_ovf, ovf); + end + if (zero != ex_zero) begin + testfailed = testfailed +1; + $display("Test ADD A:%d B:%d Failed, Expected zero:%d Got zero:%d", A, B, ex_zero, zero); + end + end + + //SUB + command = `code_SUB; + for (add_index = 0; add_index<6; add_index = add_index + 1) begin + A = a_vals[(add_index*32)+:32]; // Grab 32-bit chunk number add_index (0 = least-significant chunk) of the queued operands + B = b_vals[(add_index*32)+:32];#2000 + {ex_cout,ex_ovf,ex_zero} = sub_res[(add_index*3)+:3]; + + if (out != (A-B)) begin + testfailed = testfailed +1; + $display("Test SUB A:%d B:%d Failed, Expected Out:%d Got Out:%d", A, B, A-B, out); + end + if (cout != ex_cout) begin + testfailed = testfailed +1; + $display("Test SUB A:%d B:%d Failed, Expected cout:%d Got cout:%d", A, B, ex_cout, cout); + end + if (ovf != ex_ovf) begin + testfailed = testfailed +1; + $display("Test SUB A:%d B:%d Failed, Expected ovf:%d Got ovf:%d", A, B, ex_ovf, ovf); + end + if (zero != ex_zero) begin + testfailed = testfailed +1; + 
$display("Test SUB A:%d B:%d 
Failed, Expected zero:%d Got zero:%d", A, B, ex_zero, zero); + end + end + + //SLT + command = `code_SLT; + for (add_index = 0; add_index<6; add_index = add_index + 1) begin + A = a_vals[(add_index*32)+:32]; // Grab 32-bit chunk number add_index (0 = least-significant chunk) of the queued operands + B = b_vals[(add_index*32)+:32];#2000 + + if (out != ((A 0) begin + $display(" %d Tests Failed", testfailed); + end else begin + $display(" Tests Passed!"); + end + + end +endmodule diff --git a/verilog/alu.v b/verilog/alu.v new file mode 100644 index 0000000..9e900bf --- /dev/null +++ b/verilog/alu.v @@ -0,0 +1,73 @@ +//------------------------------------------------------------------------------ +// Arithmetic Logic Unit +// 2 inputs of width: 32 bits +// input: 3 bit control signal +// output width: 32 bits +// output: carryout, overflow, zero flags +//------------------------------------------------------------------------------ + +module ALU +( + output reg[31:0] out, + output reg carryout, ovf, zero, + input signed[31:0] a, + input signed[31:0] b, + input[2:0] cmd +); + localparam + ADD = 3'd0, + SUB = 3'd1, + XOR = 3'd2, + SLT = 3'd3, + AND = 3'd4, + NAND = 3'd5, + NOR = 3'd6, + OR = 3'd7; + + always @(*) begin + case (cmd) + ADD: begin + {carryout, out} = {1'b0, a} + {1'b0, b}; + ovf = (a[31] ~^ b[31]) && (a[31] ^ out[31]) ? 1 : 0; + zero = (a + b == 0) ? 1 : 0; + end + + SUB: begin + {carryout, out} = {1'b0, a} + {1'b0, ~b} + 32'b1; + ovf = (a[31] ^ b[31]) && (a[31] ^ out[31]) ? 1 : 0; + zero = (a - b == 0) ? 1 : 0; + end + + XOR: begin + out = a ^ b; + {carryout, ovf, zero} = 3'b0; + end + + SLT: begin + out = (a < b) ? 
32'b1 : 0; + {carryout, ovf, zero} = 3'b0; + end + + AND: begin + out = a & b; + {carryout, ovf, zero} = 3'b0; + end + + NAND: begin + out = a ~& b; + {carryout, ovf, zero} = 3'b0; + end + + NOR: begin + out = a ~| b; + {carryout, ovf, zero} = 3'b0; + end + + OR: begin + out = a | b; + {carryout, ovf, zero} = 3'b0; + end + endcase + end + +endmodule diff --git a/verilog/dataMemory.v b/verilog/dataMemory.v new file mode 100644 index 0000000..581b65b --- /dev/null +++ b/verilog/dataMemory.v @@ -0,0 +1,38 @@ +//------------------------------------------------------------------------ +// Data Memory +// Writes are negative edge triggered +// dataOut/InstrOut are reloaded from mem[address]/mem[InstrAddr] whenever that address changes +// If writeEnable is true, writes dataIn to mem[address] +//------------------------------------------------------------------------ + +module dataMemory +#( + parameter addresswidth = 7, + parameter depth = 2**addresswidth, + parameter width = 8 +) +( + output reg [width-1:0] dataOut, + output reg [width-1:0] InstrOut, + input [addresswidth-1:0] address, + input [addresswidth-1:0] InstrAddr, + input writeEnable, + input [width-1:0] dataIn, + input clk +); + reg [width-1:0] memory [depth-1:0]; + + always @(address) begin + dataOut <= memory[address]; + end + + always @(InstrAddr) begin + InstrOut <= memory[InstrAddr]; + end + + always @(negedge clk) begin + if(writeEnable) + memory[address] <= dataIn; + end + +endmodule diff --git a/verilog/dataPath.v b/verilog/dataPath.v new file mode 100644 index 0000000..3b34be5 --- /dev/null +++ b/verilog/dataPath.v @@ -0,0 +1,74 @@ +//------------------------------------------------------------------------------ +// Data path module: register file, sign/zero extender, ALU and writeback muxes +//------------------------------------------------------------------------------ + +`include "alu.v" +//`include "dataMemory.v" +`include "regfile.v" +`include "signExtend.v" + +module dataPath( + output carryout, 
ovf, zero, + output [31:0] Da, + output [31:0] ALU_out, + output [31:0] Db, + input [31:0] mem_dout, + input 
[31:0] PC, + input [4:0] Rs, + input [4:0] Rt, + input [4:0] Rd, + input [15:0] imm16, + input reg_wr, + input reg_dst, + input ALU_src, + input [2:0] ALU_ctrl, + //input mem_wr, + input mem_to_reg, + input jl, + input jal, + input jr, + input branch, + input zero_ext, + input clk + ); + + // Declare internal wires + wire [4:0] dest_reg; // Output of reg_dst mux + wire [4:0] Aw; // Output of jal mux + wire [31:0] Dw; // Output of writeback/PC+1 mux for JAL + wire [31:0] se_ze_imm16; // Output of sign/zero extender + wire [31:0] A, B; // Inputs to ALU + //wire [31:0] ALU_out; // Output of ALU + wire [31:0] writeback; // Output of mem_to_reg mux + wire [31:0] mem_dout; // Output of memory + + // Set up MUXes for regfile write address + assign dest_reg = reg_dst ? Rd : Rt; + assign Aw = jal ? 5'd31 : dest_reg; + + // Set up jal link register mux + assign Dw = jal ? (PC + 32'd1) : writeback; + + // Set up regfile + regfile reg_file(Da, Db, Dw, Rs, Rt, Aw, reg_wr, clk); + + // Set up ALU immediate/register source mux + assign B = ALU_src ? se_ze_imm16 : Db; + signExtend sign_extend(se_ze_imm16, imm16, zero_ext); + + // Set up ALU + assign A = Da; + ALU alu(ALU_out, carryout, ovf, zero, A, B, ALU_ctrl); + + // Set up data memory + //dataMemory #(32,32'h4000,32) data_mem (mem_dout, ALU_out, mem_wr, Db, clk); + // Set up load/result mux + assign writeback = mem_to_reg ? 
mem_dout : ALU_out; + + + + + + + +endmodule diff --git a/verilog/instructionDecode.v b/verilog/instructionDecode.v new file mode 100644 index 0000000..3d3eed8 --- /dev/null +++ b/verilog/instructionDecode.v @@ -0,0 +1,59 @@ +//------------------------------------------------------------------------------ +// Instruction decoder module +//------------------------------------------------------------------------------ + +`include "opcodeDecode.v" + +module instructionDecode( + output [25:0] target_address, + output [4:0] rs, + output [4:0] rt, + output [4:0] rd, + output [15:0] imm16, + output reg_wr, + output reg_dst, + output ALU_src, + output [2:0] ALU_ctrl, + output mem_wr, + output mem_to_reg, + output jl, + output jal, + output jr, + output branch, + output zero_ext, + input [31:0] instruction + ); + + wire [5:0] opcode, funct; + + // R-type instructions + assign opcode = instruction[31:26]; + assign rs = instruction[25:21]; + assign rt = instruction[20:16]; + assign rd = instruction[15:11]; + assign funct = instruction[5:0]; + + // I-type instructions + assign imm16 = instruction[15:0]; + + // J-type instructions + assign target_address = instruction[25:0]; + + // Instantiate LUT for opcodes + opcodeDecode op_decoder( + .reg_wr(reg_wr), + .reg_dst(reg_dst), + .ALU_src(ALU_src), + .ALU_ctrl(ALU_ctrl), + .mem_wr(mem_wr), + .mem_to_reg(mem_to_reg), + .jl(jl), + .jal(jal), + .jr(jr), + .branch(branch), + .zero_ext(zero_ext), + .opcode(opcode), + .funct(funct) + ); + +endmodule diff --git a/verilog/instructionFetch.v b/verilog/instructionFetch.v new file mode 100644 index 0000000..362b407 --- /dev/null +++ b/verilog/instructionFetch.v @@ -0,0 +1,40 @@ +`include "regfile-dependencies/register32.v" +//`include "signExtend.v" +//`include "instructionMemory.v" + +module instructionFetch +( + //output[31:0] Instr, + output [31:0] PC, + input[25:0] TargetAddr, + input[15:0] Imm16, + input zero, + input Branch, + input[31:0] Da, + input jr, + input jl, + input clk +); + 
//initial PC = 29'b0; + //Jumping + wire[31:0] newAddr; + wire[31:0] jumpaddr; + wire[31:0] addunit; + wire[31:0] added; + wire[31:0] same_branch_addr; + wire[31:0] signextimm; + wire muxsig1; + wire[31:0] regwrite; + + //wire nextAddr; + register32 PC_module (PC, newAddr, 1'b1, clk); + signExtend IF_SE (signextimm, Imm16, 1'b0); + //instructionMemory InstMem(Instr, {PC[31:2], 2'b00}, clk); + assign jumpaddr = {PC[29:26],TargetAddr}; + assign muxsig1 = (!zero && Branch); + assign addunit = muxsig1 ? signextimm : 32'b0; + assign added = addunit + PC + 1; + assign same_branch_addr = jr ? Da : added; + assign newAddr = jl ? jumpaddr : same_branch_addr; + +endmodule diff --git a/verilog/instructionMemory.v b/verilog/instructionMemory.v new file mode 100644 index 0000000..335f609 --- /dev/null +++ b/verilog/instructionMemory.v @@ -0,0 +1,22 @@ +module instructionMemory +( + output[31:0] DataOut, + //input regWE, //for actual memory + input[9:0] Addr, + //input[31:0] DataIn, //this is for the actual memory + input clk +); + + reg[31:0] mem[1023:0]; + + + initial $readmemh("Memories/addingtest.dat", mem);//Memories/mips1.asm + + /*always @(posedge clk) begin + if (regWE) begin + mem[Addr] <= DataIn; + end + end*/ //This is for the actual memory + + assign DataOut = mem[Addr]; +endmodule diff --git a/verilog/opcodeDecode.v b/verilog/opcodeDecode.v new file mode 100644 index 0000000..8587c78 --- /dev/null +++ b/verilog/opcodeDecode.v @@ -0,0 +1,80 @@ +//----------------------------------------------------------------------------- +// Instruction decode opcode decoder module +//----------------------------------------------------------------------------- + +module opcodeDecode( + output reg reg_wr, + output reg reg_dst, + output reg ALU_src, + output reg [2:0] ALU_ctrl, + output reg mem_wr, + output reg mem_to_reg, + output reg jl, + output reg jal, + output reg jr, + output reg branch, + output reg zero_ext, + input [5:0] opcode, + input [5:0] funct + ); + + // Define 
opcode localparams + localparam + LW = 6'h23, + SW = 6'h2b, + J = 6'h2, + JAL = 6'h3, + BNE = 6'h5, + XORI = 6'he, + ADDI = 6'h8, + RTYPE = 6'h0; + + // define funct localparams + localparam + r_jr = 6'h8, + r_add = 6'h20, + r_sub = 6'h22, + r_slt = 6'h2a; + + // Concatenate all signals into 1 + // reg_wr, reg_dst, ALU_src, ALU_ctrl, mem_wr, mem_to_reg, jl, jal, jr, branch, zero_ext + reg [12:0] control; + + // combinational block + always @(*) begin + case (opcode) + LW: + control = 13'b1_0100_0010_0000; + SW: + control = 13'b0_0100_0100_0000; + J: + control = 13'b0_0000_0001_0000; + JAL: + control = 13'b1_0000_0001_1000; + BNE: + control = 13'b0_0000_1000_0010; + XORI: + control = 13'b1_0101_0000_0001; + ADDI: + control = 13'b1_0100_0000_0000; + RTYPE: // For R-type instructions we need to check the funct bits + case (funct) + r_jr: + control = 13'b0_0000_0000_0100; + r_add: + control = 13'b1_1000_0000_0000; + r_sub: + control = 13'b1_1000_1000_0000; + r_slt: + control = 13'b1_1001_1000_0000; + default: + control = 13'b0; + endcase + default: // Just NOP if something goes screwy here + control = 13'b0; + endcase + // Split out control signal assignment + {reg_wr, reg_dst, ALU_src, ALU_ctrl, mem_wr, mem_to_reg, jl, jal, jr, branch, zero_ext} = control; + end + +endmodule diff --git a/verilog/regfile-dependencies/decoders.v b/verilog/regfile-dependencies/decoders.v new file mode 100644 index 0000000..bd759c9 --- /dev/null +++ b/verilog/regfile-dependencies/decoders.v @@ -0,0 +1,14 @@ +// 32 bit decoder with enable signal +// enable=0: all output bits are 0 +// enable=1: out[address] is 1, all other outputs are 0 +/* Verilator lint_off WIDTH */ +module decoder1to32 +( +output[31:0] out, +input enable, +input[4:0] address +); + + assign out = enable<