We know the speed of each region follows this table (from anomie):
Code: Select all
00-3f:0000-1fff= 8
00-3f:2000-3fff= 6
00-3f:4000-41ff= 12
00-3f:4200-5fff= 6
00-3f:6000-ffff= 8
40-7f:0000-ffff= 8
80-bf:0000-1fff= 8
80-bf:2000-3fff= 6
80-bf:4000-41ff= 12
80-bf:4200-5fff= 6
80-bf:6000-7fff= 8
80-bf:8000-ffff=6,8
c0-ff:0000-ffff=6,8
Code: Select all
/* Table-driven variant: speedlut is indexed by 512-byte page (addr / 512). */
inline uint speed(uint addr) {
  const uint page = addr >> 9;
  return speedlut[page];
}
Result:
Code: Select all
; load addr, shift right by 9 to form the table index, load the table pointer stored at speedlut, then read one byte at pointer+index
mov ebx,addr ; shr ebx,9 ; mov edx,[speedlut] ; mov al,[ebx+edx]
anomie's method:
Code: Select all
/*
 * Region speed for addr (bank in bits 16-23, offset in bits 0-15),
 * decided with plain mask-and-compare tests.
 */
uint speed(uint addr) {
  const uint quadrant = addr & 0xc00000;

  /* Banks 40-7f are 8 everywhere. */
  if (quadrant == 0x400000) {
    return 8;
  }
  /* Banks c0-ff, and the upper half of banks 80-bf, use the fast value. */
  if (quadrant == 0xc00000 || (addr & 0x808000) == 0x808000) {
    return fast;
  }

  /* What is left depends only on the offset, taken in 8 KB steps. */
  switch ((addr >> 13) & 7) {
  case 1:  /* 2000-3fff */
    return 6;
  case 2:  /* 4000-5fff: only the first 512 bytes (4000-41ff) are 12 */
    return (addr & 0x1e00) ? 6 : 12;
  default: /* 0000-1fff and 6000-ffff */
    return 8;
  }
}
Code: Select all
/*
 * Return the speed (6, 8 or 12) of the region that contains addr, folding
 * the region table into three tests with bit arithmetic.
 *
 * addr: 24-bit address, bank in bits 16-23 and offset in bits 0-15.
 * Returns fast for 80-bf:8000-ffff and c0-ff; 12 for 4000-41ff and 6 for
 * 2000-3fff / 4200-5fff in banks 00-3f and 80-bf; 8 everywhere else.
 */
uint speed(uint addr) {
  /* Copy offset bit 15 into bit 22, so 80-bf:8000-ffff looks like c0-ff. */
  addr |= (addr & 0x8000) << 7;
  if ((addr & 0xc00000) == 0xc00000) return fast;

  /* For offsets below 8000, bit 14 of (offset - 0x2000) is set exactly for
     0000-1fff and 6000-7fff; copy it into bit 22 as well. */
  addr |= ((addr - 0x2000) & 0x4000) << 8;

  /* Bit 22 now marks every remaining region whose speed is 8. Test the bit
     alone: comparing the top two bits against 0x400000 would miss banks
     80-bf (bit 23 is set too) and wrongly report 6 for 80-bf:0000-1fff and
     80-bf:6000-7fff. */
  if (addr & 0x400000) return 8;

  /* Only offsets 2000-5fff remain; 4000-41ff is the lone 12 page. */
  if ((addr & 0x7e00) != 0x4000) return 6;
  return 12;
}
/* fast is the value returned for 80-bf:8000-ffff and c0-ff: 6 when fastrom is set, 8 otherwise. */
fast=(fastrom)?6:8;
Both versions are quite a bit more code than the lookup table, with lots of conditional branches to ruin branch prediction.
Now, what I'm wondering is... is it worth trying to come up with more complex bit-arithmetic tricks to get rid of the LUT, or is it simply faster to use the two 32 KB lookup tables?
For what it's worth, access patterns are below for typical ROMs:
Code: Select all
00-3f:0000-1fff=common
00-3f:2000-3fff=common
00-3f:4000-41ff=very uncommon
00-3f:4200-5fff=common
00-3f:6000-7fff=uncommon
00-3f:8000-ffff=common
40-7f:0000-ffff=uncommon
80-bf:0000-1fff=common
80-bf:2000-3fff=common
80-bf:4000-41ff=very uncommon
80-bf:4200-5fff=common
80-bf:6000-7fff=uncommon
80-bf:8000-ffff=very common
c0-ff:0000-ffff=very common