]> Shamusworld >> Repos - virtualjaguar/blobdiff - src/gpu.cpp
Fixed IMASK access in DSP & GPU, more CPU browser refinements.
[virtualjaguar] / src / gpu.cpp
index 461017132812d5612a08c847ec314d5183314b22..c57b948d5088751dd169e4d20f0d5e89a7a11a3c 100644 (file)
@@ -1,9 +1,21 @@
+#if 1
+
 //
 // GPU Core
 //
 // Originally by David Raingeard (Cal2)
 // GCC/SDL port by Niels Wagenaar (Linux/WIN32) and Caz (BeOS)
-// Cleanups, endian wrongness, and bad ASM amelioration by James L. Hammons
+// Cleanups, endian wrongness, and bad ASM amelioration by James Hammons
+// (C) 2010 Underground Software
+//
+// JLH = James Hammons <jlhamm@acm.org>
+//
+// Who  When        What
+// ---  ----------  -------------------------------------------------------------
+// JLH  01/16/2010  Created this log ;-)
+// JLH  11/26/2011  Added fixes for LOAD/STORE alignment issues
+
+//
 // Note: Endian wrongness probably stems from the MAME origins of this emu and
 //       the braindead way in which MAME handles memory. :-)
 //
 
 #include "gpu.h"
 
-#include <string.h>                                                            // For memset
 #include <stdlib.h>
-#include "log.h"
+#include <string.h>                                                            // For memset
+#include "dsp.h"
+#include "jagdasm.h"
 #include "jaguar.h"
-#include "m68k.h"
+#include "log.h"
+#include "m68000/m68kinterface.h"
+//#include "memory.h"
 #include "tom.h"
-#include "memory.h"
-#include "jagdasm.h"
-#include "dsp.h"
 
+
+// Seems alignment in loads & stores was off...
+#define GPU_CORRECT_ALIGNMENT
 //#define GPU_DEBUG
 
 // For GPU dissasembly...
 
+#if 0
 #define GPU_DIS_ABS
 #define GPU_DIS_ADD
 #define GPU_DIS_ADDC
 #define GPU_DIS_SUBQT
 #define GPU_DIS_XOR
 
-bool doGPUDis = false;
-//bool doGPUDis = true;
-//*/
+//bool doGPUDis = false;
+bool doGPUDis = true;
+#endif
+
 /*
 GPU opcodes use (BIOS flying ATARI logo):
 +                    add 357416
@@ -163,7 +180,6 @@ extern int gpu_start_log;
 // Private function prototypes
 
 void GPUUpdateRegisterBanks(void);
-
 void GPUDumpDisassembly(void);
 void GPUDumpRegisters(void);
 void GPUDumpMemory(void);
@@ -234,7 +250,7 @@ static void gpu_opcode_sat24(void);
 static void gpu_opcode_pack(void);
 
 // This is wrong, since it doesn't take pipeline effects into account. !!! FIX !!!
-/*uint8 gpu_opcode_cycles[64] = 
+/*uint8 gpu_opcode_cycles[64] =
 {
        3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,
@@ -249,7 +265,7 @@ static void gpu_opcode_pack(void);
 //This is wrong, wrong, WRONG, but it seems to work for the time being...
 //(That is, it fixes Flip Out which relies on GPU timing rather than semaphores. Bad developers! Bad!)
 //What's needed here is a way to take pipeline effects into account (including pipeline stalls!)...
-/*uint8 gpu_opcode_cycles[64] = 
+/*uint8 gpu_opcode_cycles[64] =
 {
        1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,
@@ -260,7 +276,7 @@ static void gpu_opcode_pack(void);
        1,  1,  1,  1,  1,  1,  4,  1,
        1,  1,  3,  3,  1,  1,  1,  1
 };//*/
-uint8 gpu_opcode_cycles[64] = 
+uint8 gpu_opcode_cycles[64] =
 {
        1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,
@@ -272,8 +288,8 @@ uint8 gpu_opcode_cycles[64] =
        1,  1,  1,  1,  1,  1,  1,  1
 };//*/
 
-void (*gpu_opcode[64])()= 
-{      
+void (*gpu_opcode[64])()=
+{
        gpu_opcode_add,                                 gpu_opcode_addc,                                gpu_opcode_addq,                                gpu_opcode_addqt,
        gpu_opcode_sub,                                 gpu_opcode_subc,                                gpu_opcode_subq,                                gpu_opcode_subqt,
        gpu_opcode_neg,                                 gpu_opcode_and,                                 gpu_opcode_or,                                  gpu_opcode_xor,
@@ -330,7 +346,7 @@ static uint32 gpu_opcode_second_parameter;
 
 #define RESET_FLAG_Z() gpu_flag_z = 0;
 #define RESET_FLAG_N() gpu_flag_n = 0;
-#define RESET_FLAG_C() gpu_flag_c = 0;    
+#define RESET_FLAG_C() gpu_flag_c = 0;
 
 #define CLR_Z                          (gpu_flag_z = 0)
 #define CLR_ZN                         (gpu_flag_z = gpu_flag_n = 0)
@@ -351,8 +367,8 @@ uint8 * branch_condition_table = 0;
 
 uint32 gpu_opcode_use[64];
 
-const char * gpu_opcode_str[64]= 
-{      
+const char * gpu_opcode_str[64]=
+{
        "add",                          "addc",                         "addq",                         "addqt",
        "sub",                          "subc",                         "subq",                         "subqt",
        "neg",                          "and",                          "or",                           "xor",
@@ -374,12 +390,12 @@ const char * gpu_opcode_str[64]=
 static uint32 gpu_in_exec = 0;
 static uint32 gpu_releaseTimeSlice_flag = 0;
 
-void gpu_releaseTimeslice(void)
+void GPUReleaseTimeslice(void)  // Sets gpu_releaseTimeSlice_flag to 1; no parameters, no return value (presumably polled by the GPU execution loop -- confirm)
 {
        gpu_releaseTimeSlice_flag = 1;
 }
 
-uint32 gpu_get_pc(void)
+uint32 GPUGetPC(void)  // Returns the current value of gpu_pc; has no side effects
 {
        return gpu_pc;
 }
@@ -507,7 +523,7 @@ uint32 GPUReadLong(uint32 offset, uint32 who/*=UNKNOWN*/)
                        gpu_flag_n = (gpu_flag_n ? 1 : 0);
 
                        gpu_flags = (gpu_flags & 0xFFFFFFF8) | (gpu_flag_n << 2) | (gpu_flag_c << 1) | gpu_flag_z;
-                                       
+
                        return gpu_flags & 0xFFFFC1FF;
                case 0x04:
                        return gpu_matrix_control;
@@ -627,16 +643,19 @@ void GPUWriteWord(uint32 offset, uint16 data, uint32 who/*=UNKNOWN*/)
                        else
                                gpu_div_control = (gpu_div_control & 0x0000FFFF) | ((data & 0xFFFF) << 16);
                }
-               else 
+               else
                {
 //WriteLog("[GPU W16:%08X,%04X]", offset, data);
                        uint32 old_data = GPUReadLong(offset & 0xFFFFFFC, who);
+
                        if (offset & 0x02)
                                old_data = (old_data & 0xFFFF0000) | (data & 0xFFFF);
                        else
                                old_data = (old_data & 0x0000FFFF) | ((data & 0xFFFF) << 16);
+
                        GPUWriteLong(offset & 0xFFFFFFC, old_data, who);
                }
+
                return;
        }
        else if ((offset == GPU_WORK_RAM_BASE + 0x0FFF) || (GPU_CONTROL_RAM_BASE + 0x1F))
@@ -684,7 +703,9 @@ void GPUWriteLong(uint32 offset, uint32 data, uint32 who/*=UNKNOWN*/)
                case 0x00:
                {
                        bool IMASKCleared = (gpu_flags & IMASK) && !(data & IMASK);
-                       gpu_flags = data;
+                       // NOTE: According to the JTRM, writing a 1 to IMASK has no effect; only the
+                       //       IRQ logic can set it. So we mask it out here to prevent problems...
+                       gpu_flags = data & (~IMASK);
                        gpu_flag_z = gpu_flags & ZERO_FLAG;
                        gpu_flag_c = (gpu_flags & CARRY_FLAG) >> 1;
                        gpu_flag_n = (gpu_flags & NEGA_FLAG) >> 2;
@@ -719,7 +740,7 @@ WriteLog("GPU: %s setting GPU PC to %08X %s\n", whoName[who], gpu_pc, (GPU_RUNNI
 #endif // GPU_DEBUG
                        break;
                case 0x14:
-               {       
+               {
 //                     uint32 gpu_was_running = GPU_RUNNING;
                        data &= ~0xF7C0;                // Disable writes to INT_LAT0-4 & TOM version number
 
@@ -727,13 +748,14 @@ WriteLog("GPU: %s setting GPU PC to %08X %s\n", whoName[who], gpu_pc, (GPU_RUNNI
                        if (data & 0x02)
                        {
 //WriteLog("GPU->CPU interrupt\n");
-                               if (tom_irq_enabled(IRQ_GPU))
+                               if (TOMIRQEnabled(IRQ_GPU))
                                {
-                                       if ((tom_irq_enabled(IRQ_GPU)) && (jaguar_interrupt_handler_is_valid(64)))
+//This is the programmer's responsibility, to make sure the handler is valid, not ours!
+//                                     if ((TOMIRQEnabled(IRQ_GPU))// && (JaguarInterruptHandlerIsValid(64)))
                                        {
-                                               tom_set_pending_gpu_int();
-                                               m68k_set_irq(7);                        // Set 68000 NMI
-                                               gpu_releaseTimeslice();
+                                               TOMSetPendingGPUInt();
+                                               m68k_set_irq(2);                        // Set 68000 IPL 2
+                                               GPUReleaseTimeslice();
                                        }
                                }
                                data &= ~0x02;
@@ -745,7 +767,7 @@ WriteLog("GPU: %s setting GPU PC to %08X %s\n", whoName[who], gpu_pc, (GPU_RUNNI
 //WriteLog("CPU->GPU interrupt\n");
                                GPUSetIRQLine(0, ASSERT_LINE);
                                m68k_end_timeslice();
-                               dsp_releaseTimeslice();
+                               DSPReleaseTimeslice();
                                data &= ~0x04;
                        }
 
@@ -763,13 +785,13 @@ WriteLog("GPU: %s setting GPU PC to %08X %s\n", whoName[who], gpu_pc, (GPU_RUNNI
                        {
                                WriteLog("GPU: Write32--About to do stupid braindead GPU execution for 200 cycles.\n");
 #endif // GPU_DEBUG
-                               gpu_exec(200);
+                               GPUExec(200);
 #ifdef GPU_DEBUG
                        }
 #endif // GPU_DEBUG//*/
 #else
                        if (gpu_control & 0x18)
-                               gpu_exec(1);
+                               GPUExec(1);
 #endif // #ifndef GPU_SINGLE_STEPPING
 #ifdef GPU_DEBUG
 WriteLog("Write to GPU CTRL by %s: %08X ", whoName[who], data);
@@ -821,7 +843,7 @@ if (GPU_RUNNING && effect_start5 && gpu_pc == 0xF035D8)
                for(int x=0; x<2; x++)
                {
                        JaguarWriteLong(dst, JaguarReadLong(src));
-                       
+
                        src += 4;
                        dst += 4;
                }
@@ -918,12 +940,12 @@ void GPUHandleIRQs(void)
 
        // Get the interrupt latch & enable bits
        uint32 bits = (gpu_control >> 6) & 0x1F, mask = (gpu_flags >> 4) & 0x1F;
-       
+
        // Bail out if latched interrupts aren't enabled
        bits &= mask;
        if (!bits)
                return;
-       
+
        // Determine which interrupt to service
        uint32 which = 0; //Isn't there a #pragma to disable this warning???
        if (bits & 0x01)
@@ -940,18 +962,18 @@ void GPUHandleIRQs(void)
        if (start_logging)
                WriteLog("GPU: Generating IRQ #%i\n", which);
 
-       // set the interrupt flag 
+       // set the interrupt flag
        gpu_flags |= IMASK;
        GPUUpdateRegisterBanks();
 
-       // subqt  #4,r31                ; pre-decrement stack pointer 
-       // move  pc,r30                 ; address of interrupted code 
+       // subqt  #4,r31                ; pre-decrement stack pointer
+       // move  pc,r30                 ; address of interrupted code
        // store  r30,(r31)     ; store return address
        gpu_reg[31] -= 4;
        GPUWriteLong(gpu_reg[31], gpu_pc - 2, GPU);
-       
-       // movei  #service_address,r30  ; pointer to ISR entry 
-       // jump  (r30)                                  ; jump to ISR 
+
+       // movei  #service_address,r30  ; pointer to ISR entry
+       // jump  (r30)                                  ; jump to ISR
        // nop
        gpu_pc = gpu_reg[30] = GPU_WORK_RAM_BASE + (which * 0x10);
 }
@@ -975,7 +997,7 @@ void GPUSetIRQLine(int irqline, int state)
 //#include "gpu2.h"
 //#include "gpu3.h"
 
-void gpu_init(void)
+void GPUInit(void)
 {
 //     memory_malloc_secure((void **)&gpu_ram_8, 0x1000, "GPU work RAM");
 //     memory_malloc_secure((void **)&gpu_reg_bank_0, 32 * sizeof(int32), "GPU bank 0 regs");
@@ -983,14 +1005,14 @@ void gpu_init(void)
 
        build_branch_condition_table();
 
-       gpu_reset();
+       GPUReset();
 
 //TEMPORARY: Testing only!
 //     gpu2_init();
 //     gpu3_init();
 }
 
-void gpu_reset(void)
+void GPUReset(void)
 {
        // GPU registers (directly visible)
        gpu_flags                         = 0x00000000;
@@ -1016,15 +1038,15 @@ void gpu_reset(void)
        memset(gpu_ram_8, 0xFF, 0x1000);
        gpu_in_exec = 0;
 //not needed   GPUInterruptPending = false;
-       gpu_reset_stats();
+       GPUResetStats();
 }
 
-uint32 gpu_read_pc(void)
+uint32 GPUReadPC(void)  // Returns the current value of gpu_pc (body is identical to GPUGetPC)
 {
        return gpu_pc;
 }
 
-void gpu_reset_stats(void)
+void GPUResetStats(void)
 {
        for(uint32 i=0; i<64; i++)
                gpu_opcode_use[i] = 0;
@@ -1076,11 +1098,11 @@ void GPUDumpMemory(void)
                        gpu_ram_8[i+1], gpu_ram_8[i+2], gpu_ram_8[i+3]);
 }
 
-void gpu_done(void)
-{ 
+void GPUDone(void)
+{
        WriteLog("GPU: Stopped at PC=%08X (GPU %s running)\n", (unsigned int)gpu_pc, GPU_RUNNING ? "was" : "wasn't");
 
-       // Get the interrupt latch & enable bits 
+       // Get the interrupt latch & enable bits
        uint8 bits = (gpu_control >> 6) & 0x1F, mask = (gpu_flags >> 4) & 0x1F;
        WriteLog("GPU: Latch bits = %02X, enable bits = %02X\n", bits, mask);
 
@@ -1106,7 +1128,7 @@ void gpu_done(void)
 static int testCount = 1;
 static int len = 0;
 static bool tripwire = false;
-void gpu_exec(int32 cycles)
+void GPUExec(int32 cycles)
 {
        if (!GPU_RUNNING)
                return;
@@ -1161,7 +1183,7 @@ if (gpu_ram_8[0x054] == 0x98 && gpu_ram_8[0x055] == 0x0A && gpu_ram_8[0x056] ==
 /*             gpu_flag_c = (gpu_flag_c ? 1 : 0);
                gpu_flag_z = (gpu_flag_z ? 1 : 0);
                gpu_flag_n = (gpu_flag_n ? 1 : 0);*/
-       
+
                uint16 opcode = GPUReadWord(gpu_pc, GPU);
                uint32 index = opcode >> 10;
                gpu_instruction = opcode;                               // Added for GPU #3...
@@ -1267,7 +1289,7 @@ WriteLog("GPU: [%08X] %s (RM=%08X, RN=%08X) -> ", gpu_pc, buffer, RM, RN);
 //             gpu3_opcode[index]();
 
 // BIOS hacking
-//GPU: [00F03548] jr      nz,00F03560 (0xd561) (RM=00F03114, RN=00000004) ->     --> JR: Branch taken. 
+//GPU: [00F03548] jr      nz,00F03560 (0xd561) (RM=00F03114, RN=00000004) ->     --> JR: Branch taken.
 /*static bool firstTime = true;
 if (gpu_pc == 0xF03548 && firstTime)
 {
@@ -1368,7 +1390,7 @@ const char * condition[32] =
 if (gpu_start_log)
        WriteLog("    --> JUMP: Branch taken.\n");
                uint32 delayed_pc = RM;
-               gpu_exec(1);
+               GPUExec(1);
                gpu_pc = delayed_pc;
 /*             uint16 opcode = GPUReadWord(gpu_pc, GPU);
                gpu_opcode_first_parameter = (opcode >> 5) & 0x1F;
@@ -1423,7 +1445,7 @@ if (gpu_start_log)
        WriteLog("    --> JR: Branch taken.\n");
                int32 offset = (IMM_1 & 0x10 ? 0xFFFFFFF0 | IMM_1 : IMM_1);             // Sign extend IMM_1
                int32 delayed_pc = gpu_pc + (offset * 2);
-               gpu_exec(1);
+               GPUExec(1);
                gpu_pc = delayed_pc;
 /*             uint16 opcode = GPUReadWord(gpu_pc, GPU);
                gpu_opcode_first_parameter = (opcode >> 5) & 0x1F;
@@ -1714,7 +1736,16 @@ static void gpu_opcode_store_r14_indexed(void)
        if (doGPUDis)
                WriteLog("%06X: STORE  R%02u, (R14+$%02X) [NCZ:%u%u%u, R%02u=%08X, R14+$%02X=%08X]\n", gpu_pc-2, IMM_2, gpu_convert_zero[IMM_1] << 2, gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN, gpu_convert_zero[IMM_1] << 2, gpu_reg[14]+(gpu_convert_zero[IMM_1] << 2));
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT
+       uint32 address = gpu_reg[14] + (gpu_convert_zero[IMM_1] << 2);
+       
+       if (address >= 0xF03000 && address <= 0xF03FFF)
+               GPUWriteLong(address & 0xFFFFFFFC, RN, GPU);
+       else
+               GPUWriteLong(address, RN, GPU);
+#else
        GPUWriteLong(gpu_reg[14] + (gpu_convert_zero[IMM_1] << 2), RN, GPU);
+#endif
 }
 
 static void gpu_opcode_store_r15_indexed(void)
@@ -1723,7 +1754,16 @@ static void gpu_opcode_store_r15_indexed(void)
        if (doGPUDis)
                WriteLog("%06X: STORE  R%02u, (R15+$%02X) [NCZ:%u%u%u, R%02u=%08X, R15+$%02X=%08X]\n", gpu_pc-2, IMM_2, gpu_convert_zero[IMM_1] << 2, gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN, gpu_convert_zero[IMM_1] << 2, gpu_reg[15]+(gpu_convert_zero[IMM_1] << 2));
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT
+       uint32 address = gpu_reg[15] + (gpu_convert_zero[IMM_1] << 2);
+
+       if (address >= 0xF03000 && address <= 0xF03FFF)
+               GPUWriteLong(address & 0xFFFFFFFC, RN, GPU);
+       else
+               GPUWriteLong(address, RN, GPU);
+#else
        GPUWriteLong(gpu_reg[15] + (gpu_convert_zero[IMM_1] << 2), RN, GPU);
+#endif
 }
 
 static void gpu_opcode_load_r14_ri(void)
@@ -1732,7 +1772,16 @@ static void gpu_opcode_load_r14_ri(void)
        if (doGPUDis)
                WriteLog("%06X: LOAD   (R14+R%02u), R%02u [NCZ:%u%u%u, R14+R%02u=%08X, R%02u=%08X] -> ", gpu_pc-2, IMM_1, IMM_2, gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_1, RM+gpu_reg[14], IMM_2, RN);
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT
+       uint32 address = gpu_reg[14] + RM;
+
+       if (address >= 0xF03000 && address <= 0xF03FFF)
+               RN = GPUReadLong(address & 0xFFFFFFFC, GPU);
+       else
+               RN = GPUReadLong(address, GPU);
+#else
        RN = GPUReadLong(gpu_reg[14] + RM, GPU);
+#endif
 #ifdef GPU_DIS_LOAD14R
        if (doGPUDis)
                WriteLog("[NCZ:%u%u%u, R%02u=%08X]\n", gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN);
@@ -1745,7 +1794,16 @@ static void gpu_opcode_load_r15_ri(void)
        if (doGPUDis)
                WriteLog("%06X: LOAD   (R15+R%02u), R%02u [NCZ:%u%u%u, R15+R%02u=%08X, R%02u=%08X] -> ", gpu_pc-2, IMM_1, IMM_2, gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_1, RM+gpu_reg[15], IMM_2, RN);
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT
+       uint32 address = gpu_reg[15] + RM;
+
+       if (address >= 0xF03000 && address <= 0xF03FFF)
+               RN = GPUReadLong(address & 0xFFFFFFFC, GPU);
+       else
+               RN = GPUReadLong(address, GPU);
+#else
        RN = GPUReadLong(gpu_reg[15] + RM, GPU);
+#endif
 #ifdef GPU_DIS_LOAD15R
        if (doGPUDis)
                WriteLog("[NCZ:%u%u%u, R%02u=%08X]\n", gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN);
@@ -1758,7 +1816,16 @@ static void gpu_opcode_store_r14_ri(void)
        if (doGPUDis)
                WriteLog("%06X: STORE  R%02u, (R14+R%02u) [NCZ:%u%u%u, R%02u=%08X, R14+R%02u=%08X]\n", gpu_pc-2, IMM_2, IMM_1, gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN, IMM_1, RM+gpu_reg[14]);
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT
+       uint32 address = gpu_reg[14] + RM;
+
+       if (address >= 0xF03000 && address <= 0xF03FFF)
+               GPUWriteLong(address & 0xFFFFFFFC, RN, GPU);
+       else
+               GPUWriteLong(address, RN, GPU);
+#else
        GPUWriteLong(gpu_reg[14] + RM, RN, GPU);
+#endif
 }
 
 static void gpu_opcode_store_r15_ri(void)
@@ -1767,7 +1834,16 @@ static void gpu_opcode_store_r15_ri(void)
        if (doGPUDis)
                WriteLog("%06X: STORE  R%02u, (R15+R%02u) [NCZ:%u%u%u, R%02u=%08X, R15+R%02u=%08X]\n", gpu_pc-2, IMM_2, IMM_1, gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN, IMM_1, RM+gpu_reg[15]);
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT_STORE  // NOTE(review): the sibling LOAD/STORE handlers test GPU_CORRECT_ALIGNMENT; no definition of the _STORE variant is visible in this diff, so this aligned path may never be compiled -- confirm whether that is intended
+       uint32 address = gpu_reg[15] + RM;  // Effective address: R15 + RM
+
+       if (address >= 0xF03000 && address <= 0xF03FFF)
+               GPUWriteLong(address & 0xFFFFFFFC, RN, GPU);  // Inside $F03000-$F03FFF the low two address bits are cleared
+       else
+               GPUWriteLong(address, RN, GPU);  // Outside that range the address is used unmodified
+#else
        GPUWriteLong(gpu_reg[15] + RM, RN, GPU);
+#endif
 }
 
 static void gpu_opcode_nop(void)
@@ -1817,10 +1893,17 @@ static void gpu_opcode_storew(void)
        if (doGPUDis)
                WriteLog("%06X: STOREW R%02u, (R%02u) [NCZ:%u%u%u, R%02u=%08X, R%02u=%08X]\n", gpu_pc-2, IMM_2, IMM_1, gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN, IMM_1, RM);
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT
+       if ((RM >= 0xF03000) && (RM <= 0xF03FFF))
+               GPUWriteLong(RM & 0xFFFFFFFE, RN & 0xFFFF, GPU);
+       else
+               JaguarWriteWord(RM, RN, GPU);
+#else
        if ((RM >= 0xF03000) && (RM <= 0xF03FFF))
                GPUWriteLong(RM, RN & 0xFFFF, GPU);
        else
                JaguarWriteWord(RM, RN, GPU);
+#endif
 }
 
 static void gpu_opcode_store(void)
@@ -1829,13 +1912,33 @@ static void gpu_opcode_store(void)
        if (doGPUDis)
                WriteLog("%06X: STORE  R%02u, (R%02u) [NCZ:%u%u%u, R%02u=%08X, R%02u=%08X]\n", gpu_pc-2, IMM_2, IMM_1, gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN, IMM_1, RM);
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT
+       if ((RM >= 0xF03000) && (RM <= 0xF03FFF))
+               GPUWriteLong(RM & 0xFFFFFFFC, RN, GPU);
+       else
+               GPUWriteLong(RM, RN, GPU);
+#else
        GPUWriteLong(RM, RN, GPU);
+#endif
 }
 
 static void gpu_opcode_storep(void)
 {
+#ifdef GPU_CORRECT_ALIGNMENT  // STOREP handler: writes gpu_hidata at the target address and RN four bytes after it
+       if ((RM >= 0xF03000) && (RM <= 0xF03FFF))  // Target address (RM) lies in $F03000-$F03FFF
+       {
+               GPUWriteLong((RM & 0xFFFFFFF8) + 0, gpu_hidata, GPU);  // Low three address bits cleared, i.e. 8-byte boundary
+               GPUWriteLong((RM & 0xFFFFFFF8) + 4, RN, GPU);
+       }
+       else  // Any other address is passed through unmodified
+       {
+               GPUWriteLong(RM + 0, gpu_hidata, GPU);
+               GPUWriteLong(RM + 4, RN, GPU);
+       }
+#else  // Without GPU_CORRECT_ALIGNMENT: no address masking anywhere
        GPUWriteLong(RM + 0, gpu_hidata, GPU);
        GPUWriteLong(RM + 4, RN, GPU);
+#endif
 }
 
 static void gpu_opcode_loadb(void)
@@ -1860,23 +1963,60 @@ static void gpu_opcode_loadw(void)
        if (doGPUDis)
                WriteLog("%06X: LOADW  (R%02u), R%02u [NCZ:%u%u%u, R%02u=%08X, R%02u=%08X] -> ", gpu_pc-2, IMM_1, IMM_2, gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_1, RM, IMM_2, RN);
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT
+       if ((RM >= 0xF03000) && (RM <= 0xF03FFF))
+               RN = GPUReadLong(RM & 0xFFFFFFFE, GPU) & 0xFFFF;
+       else
+               RN = JaguarReadWord(RM, GPU);
+#else
        if ((RM >= 0xF03000) && (RM <= 0xF03FFF))
                RN = GPUReadLong(RM, GPU) & 0xFFFF;
        else
                RN = JaguarReadWord(RM, GPU);
+#endif
 #ifdef GPU_DIS_LOADW
        if (doGPUDis)
                WriteLog("[NCZ:%u%u%u, R%02u=%08X]\n", gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN);
 #endif
 }
 
+// According to the docs, & "Do The Same", this address is long aligned...
+// So let's try it:
+// And it works!!! Need to fix all instances...
+// Also, Power Drive Rally seems to contradict the idea that only LOADs in
+// the $F03000-$F03FFF range are aligned...
+#warning "!!! Alignment issues, need to find definitive final word on this !!!"
+/*
+Preliminary testing on real hardware seems to confirm that something strange goes on
+with unaligned reads in main memory. When the address is off by 1, the result is the
+same as the long address with the top byte replaced by something. So if the read is
+from $401, and $400 has 12 34 56 78, the value read will be $nn345678, where nn is a currently unknown value.
+When the address is off by 2, the result would be $nnnn5678, where nnnn is unknown.
+When the address is off by 3, the result would be $nnnnnn78, where nnnnnn is unknown.
+It may be that the "unknown" values come from the prefetch queue, but not sure how
+to test that. They seem to be stable, though, which would indicate such a mechanism.
+Sometimes, however, the off by 2 case returns $12345678!
+*/
 static void gpu_opcode_load(void)
 {
 #ifdef GPU_DIS_LOAD
        if (doGPUDis)
                WriteLog("%06X: LOAD   (R%02u), R%02u [NCZ:%u%u%u, R%02u=%08X, R%02u=%08X] -> ", gpu_pc-2, IMM_1, IMM_2, gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_1, RM, IMM_2, RN);
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT  // Aligned LOAD: the address is always long-aligned before reading, whatever range it falls in
+//     uint32 mask[4] = { 0x00000000, 0xFF000000, 0xFFFF0000, 0xFFFFFF00 };  // Disabled: only the garbage simulation below uses it; re-enable both together
+//     if ((RM >= 0xF03000) && (RM <= 0xF03FFF))
+               RN = GPUReadLong(RM & 0xFFFFFFFC, GPU);  // Low two address bits cleared unconditionally
+//             RN = GPUReadLong(RM & 0x00FFFFFC, GPU);
+//     else
+//             RN = GPUReadLong(RM, GPU);
+       // Simulate garbage in unaligned reads...
+//seems that this behavior is different in GPU mem vs. main mem...
+//     if ((RM < 0xF03000) || (RM > 0xF0BFFF))
+//             RN |= mask[RM & 0x03];
+#else
        RN = GPUReadLong(RM, GPU);
+#endif
 #ifdef GPU_DIS_LOAD
        if (doGPUDis)
                WriteLog("[NCZ:%u%u%u, R%02u=%08X]\n", gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN);
@@ -1885,8 +2025,21 @@ static void gpu_opcode_load(void)
 
 static void gpu_opcode_loadp(void)
 {
+#ifdef GPU_CORRECT_ALIGNMENT  // LOADP handler: reads gpu_hidata from the source address and RN from four bytes after it
+       if ((RM >= 0xF03000) && (RM <= 0xF03FFF))  // Source address (RM) lies in $F03000-$F03FFF
+       {
+               gpu_hidata = GPUReadLong((RM & 0xFFFFFFF8) + 0, GPU);  // Low three address bits cleared, i.e. 8-byte boundary
+               RN                 = GPUReadLong((RM & 0xFFFFFFF8) + 4, GPU);
+       }
+       else  // Any other address is passed through unmodified
+       {
+               gpu_hidata = GPUReadLong(RM + 0, GPU);
+               RN                 = GPUReadLong(RM + 4, GPU);
+       }
+#else  // Without GPU_CORRECT_ALIGNMENT: no address masking anywhere
        gpu_hidata = GPUReadLong(RM + 0, GPU);
        RN                 = GPUReadLong(RM + 4, GPU);
+#endif
 }
 
 static void gpu_opcode_load_r14_indexed(void)
@@ -1895,7 +2048,16 @@ static void gpu_opcode_load_r14_indexed(void)
        if (doGPUDis)
                WriteLog("%06X: LOAD   (R14+$%02X), R%02u [NCZ:%u%u%u, R14+$%02X=%08X, R%02u=%08X] -> ", gpu_pc-2, gpu_convert_zero[IMM_1] << 2, IMM_2, gpu_flag_n, gpu_flag_c, gpu_flag_z, gpu_convert_zero[IMM_1] << 2, gpu_reg[14]+(gpu_convert_zero[IMM_1] << 2), IMM_2, RN);
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT
+       uint32 address = gpu_reg[14] + (gpu_convert_zero[IMM_1] << 2);
+
+       if ((RM >= 0xF03000) && (RM <= 0xF03FFF))
+               RN = GPUReadLong(address & 0xFFFFFFFC, GPU);
+       else
+               RN = GPUReadLong(address, GPU);
+#else
        RN = GPUReadLong(gpu_reg[14] + (gpu_convert_zero[IMM_1] << 2), GPU);
+#endif
 #ifdef GPU_DIS_LOAD14I
        if (doGPUDis)
                WriteLog("[NCZ:%u%u%u, R%02u=%08X]\n", gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN);
@@ -1908,7 +2070,16 @@ static void gpu_opcode_load_r15_indexed(void)
        if (doGPUDis)
                WriteLog("%06X: LOAD   (R15+$%02X), R%02u [NCZ:%u%u%u, R15+$%02X=%08X, R%02u=%08X] -> ", gpu_pc-2, gpu_convert_zero[IMM_1] << 2, IMM_2, gpu_flag_n, gpu_flag_c, gpu_flag_z, gpu_convert_zero[IMM_1] << 2, gpu_reg[15]+(gpu_convert_zero[IMM_1] << 2), IMM_2, RN);
 #endif
+#ifdef GPU_CORRECT_ALIGNMENT
+       uint32 address = gpu_reg[15] + (gpu_convert_zero[IMM_1] << 2);
+
+       if ((RM >= 0xF03000) && (RM <= 0xF03FFF))
+               RN = GPUReadLong(address & 0xFFFFFFFC, GPU);
+       else
+               RN = GPUReadLong(address, GPU);
+#else
        RN = GPUReadLong(gpu_reg[15] + (gpu_convert_zero[IMM_1] << 2), GPU);
+#endif
 #ifdef GPU_DIS_LOAD15I
        if (doGPUDis)
                WriteLog("[NCZ:%u%u%u, R%02u=%08X]\n", gpu_flag_n, gpu_flag_c, gpu_flag_z, IMM_2, RN);
@@ -2103,7 +2274,7 @@ static void gpu_opcode_mmult(void)
        if (gpu_matrix_control & 0x10)                          // Column stepping
        {
                for(int i=0; i<count; i++)
-               { 
+               {
                        int16 a;
                        if (i & 0x01)
                                a = (int16)((gpu_alternate_reg[IMM_1 + (i >> 1)] >> 16) & 0xFFFF);
@@ -2420,3 +2591,13 @@ static void gpu_opcode_sh(void)
 //Temporary: Testing only!
 //#include "gpu2.cpp"
 //#include "gpu3.cpp"
+
+#else
+
+// New thread-safe GPU core
+
+int GPUCore(void * data)  // Empty stub; 'data' is never read. NOTE(review): declared to return int but the body has no return statement -- add one before enabling this #else branch
+{
+}
+
+#endif