I already do that in both the 68k and the GPU code, but thanks for the advice.

Here is my code for the test:
68k:
CODE
; Fill the screen with a 20x16 grid of 16x16-pixel tiles using the blitter
; (68k version).
;
; A1 is set up on `screen` (16-bit pixels, 320 wide) and A2 on `tileset`
; (16-bit pixels, 64 wide). Every blit copies the 16x16 block that starts
; at X=32, Y=0 of the tileset ("tile 1") to one grid cell of the screen.
;
; Register usage:
;   d0 = column counter, 19..0        d1 = row counter, 15..0
;   d2 = scratch (status / pixel position)
;   d6 = B_COUNT value                d7 = B_CMD value

; --- one-time blitter setup -------------------------------------------
        move.l  #screen,A1_BASE
        move.l  #tileset,A2_BASE
        move.l  #$0001FFF0,d0           ; step: Y = +1 (high word), X = -16 (low word)
        move.l  d0,A1_STEP
        move.l  d0,A2_STEP
        move.l  #PIXEL16|XADDPHR|WID320|PITCH1,A1_FLAGS
        move.l  #PIXEL16|XADDPHR|WID64|PITCH1,A2_FLAGS
        move.l  #$00100010,d6           ; B_COUNT 16x16
        move.l  #LFU_REPLACE|SRCEN|UPDA1|UPDA2,d7 ; B_CMD !
        move.w  #$F000,BG

; --- tile loop --------------------------------------------------------
        moveq   #16-1,d1                ; 16 rows
Ylop:   moveq   #20-1,d0                ; 20 columns
Xlop:
; Do not touch the blitter registers while the previous blit is still
; running: poll the status read back from B_CMD until bit 0 (idle) is set.
; d2 is free here because it is rebuilt just below.
.waitb: move.l  B_CMD,d2
        btst    #0,d2
        beq.s   .waitb
; Blit Info Position: d2 = (row*16) << 16 | (column*16)
        move.w  d1,d2                   ; Y
        lsl.w   #4,d2                   ; *16
        swap    d2
        move.w  d0,d2                   ; X
        lsl.w   #4,d2                   ; *16
        move.l  d2,A1_PIXEL
        moveq   #32,d2                  ; X: tile 1 (Y = 0)
        move.l  d2,A2_PIXEL
; Blit !!
        move.l  d6,B_COUNT
        move.l  d7,B_CMD                ; writing B_CMD starts the blit
        dbra    d0,Xlop
        dbra    d1,Ylop
GPU:
CODE
; Fill the screen with a 20x16 grid of 16x16-pixel tiles using the blitter
; (GPU version).
;
; A1 is set up on `screen` (16-bit pixels, 320 wide) and A2 on `tileset`
; (16-bit pixels, 64 wide). Every blit copies the 16x16 block that starts
; at X=32, Y=0 of the tileset ("tile 1") to one grid cell of the screen.
;
; Register usage inside the loop:
;   r0 = B_COUNT value        r1 = address of B_COUNT
;   r2 = B_CMD value          r3 = address of B_CMD
;   r4 = row counter 16..1    r5 = column counter 20..1
;   r6 = address of A1_PIXEL  r7 = address of A2_PIXEL
;   r8 = A2_PIXEL value       r9/r10/r12 = scratch
;   r11 = blitter idle mask   r13 = address of Ylop

; --- one-time blitter setup -------------------------------------------
        movei   #screen,r0
        movei   #A1_BASE,r1
        store   r0,(r1)
        movei   #tileset,r0
        movei   #A2_BASE,r1
        store   r0,(r1)
        movei   #$0001FFF0,r0           ; y+1 x-16
        movei   #A1_STEP,r1
        store   r0,(r1)
        movei   #A2_STEP,r1
        store   r0,(r1)
        movei   #PIXEL16|XADDPHR|WID320|PITCH1,r0
        movei   #A1_FLAGS,r1
        store   r0,(r1)
        movei   #PIXEL16|XADDPHR|WID64|PITCH1,r0
        movei   #A2_FLAGS,r1
        store   r0,(r1)
        movei   #$F000,r4
        movei   #BG,r5
        storew  r4,(r5)

; --- loop-invariant values kept in registers --------------------------
        movei   #$00100010,r0           ; B_COUNT 16x16
        movei   #B_COUNT,r1
        movei   #LFU_REPLACE|SRCEN|UPDA1|UPDA2,r2 ; B_CMD !
        movei   #B_CMD,r3
        moveq   #1,r11                  ; WAIT BLITTER MASK (bit 0 of status)
        movei   #A1_PIXEL,r6
        movei   #A2_PIXEL,r7
        movei   #32,r8                  ; X: tile 1 (Y = 0)
        movei   #Ylop,r13

; --- tile loop --------------------------------------------------------
        movei   #16,r4                  ; Y counter
Ylop:   movei   #20,r5                  ; X counter
Xlop:
.waitb: load    (r3),r12                ; Wait for the blitter to complete !
        and     r11,r12
        jr      EQ,.waitb
        nop
; Blit Info Position: r9 = (row*16) << 16 | (column*16).
; The counters run N..1, so subtract 1 to get the zero-based tile index
; (rows 15..0, columns 19..0). Using the raw counters would place tiles at
; X=16..320 / Y=16..256 and never at row 0 or column 0.
        move    r4,r9                   ; Y
        subq    #1,r9                   ; 16..1 -> 15..0
        shlq    #4,r9                   ; *16
        rorq    #16,r9                  ; swap into the high word
        move    r5,r10                  ; X
        subq    #1,r10                  ; 20..1 -> 19..0
        shlq    #4,r10                  ; *16
        or      r10,r9
        store   r9,(r6)                 ; A1_PIXEL
        store   r8,(r7)                 ; A2_PIXEL, X: tile 1
; Blit !!
        store   r0,(r1)                 ; B_COUNT
        store   r2,(r3)                 ; B_CMD
        subq    #1,r5
        jr      NE,Xlop
        nop                             ; branch delay slot
        subq    #1,r4
        jump    NE,(r13)
        nop                             ; branch delay slot