;
;FracNEON by Michael Kbel
;
;List of registers used...so I don't become too confused ;-)
;
;VFP Register Usage
;------------------
;Q0 :F32: x (Mandelbrot data for 4 pixels)
;Q1 :F32: y (Mandelbrot data for 4 pixels)
;Q2 :U32: iteration counter (4 counters, each 32 bits)
;Q3 :U32: diverge comparison Value (and auxiliary calculation value)
;Q4 :F32: a (Mandelbrot data constant = x offset)
;Q5 :F32: b (Mandelbrot data constant = y offset)
;Q6 :F32: x*x + y*y (bailout value for 3.Iteration, saving for later check)
;Q7 :F32: x*x + y*y (bailout value for 4.Iteration, saving for later check)
;Q8 :F32: auxiliary calculation value
;Q9 :F32: auxiliary calculation value
;Q10:F32: x*x + y*y (bailout value for 1.Iteration, saving for later check)
;Q11:F32: x*x + y*y (bailout value for 2.Iteration, saving for later check)
;Q12:F32: auxiliary calculation value
;Q13:F32: used for intermediate storage of some results
;Q14:U32: iteration sub counter constant for each pixel (4x=4, end of line=1)
;Q15:F32: Mandelbrot bailout check constant = 4.0
;
;ARM Register Usage
;------------------
;R0 : auxiliary value
;R1 : auxiliary value
;R2 : auxiliary value
;R3 : auxiliary value
;R4 : global iteration counter
;R5 : screen start of line position (to save instructions and memory access)
;R6 : screen position of 1.iterating pixel = x1
;R7 : screen position of 2.iterating pixel = x2
;R8 : screen position of 3.iterating pixel = x3
;R9 : screen position of 4.iterating pixel = x4
;R10: screen line position = x
;R11: end of line flag (first 4 bits represent the 4 iterated pixels)
;R12: maximum iterations - 1 constant
;

   ;---------Program start-up
   ;Point R13 at the Stack label, call Init, then clear the screen once before
   ;and once after the call to Set_Draw_Bank.
   ;NOTE(review): Print, GcolBackground and Cls are macros and Init /
   ;Set_Draw_Bank are routines defined outside this section; their register
   ;usage cannot be seen from here.
   ADR      R13,Stack
   Print "<4>"
   BL       Init
   GcolBackground black
   Cls
   BL       Set_Draw_Bank
   Cls

   ;---------VFP Init code from Terje
   ;Grant access to coprocessors 10/11 and set the enable bit in FPEXC, which
   ;needs a privileged mode; afterwards the mode field is set back to &10.
   SWI      OS_EnterOS            ;switch to a privileged mode
   MRC      P15,0,R0,C1,C0,2      ;read CP15 c1,c0,2 (coprocessor access control)
   ORR      R0,R0,#&F << 20       ;set bits 20-23 (access fields for CP10 and CP11)
   MCR      P15,0,R0,C1,C0,2      ;write it back
   ISB      SY                    ;barrier before the following FPEXC write
   MOV      R0,#1 << 30           ;bit 30 of FPEXC = enable
   VMSR     FPEXC,R0
   MSR      CPSR_c,#&10           ;control field = &10 -> User mode


   ;---------Calculate Mandelbrot Fractal in Single Precision
   ;One-time setup before the line loop: start the timer, derive the step
   ;width per pixel and load the values that stay in registers throughout.

   SWI OS_ReadMonotonicTime
   STR R0,timer                     ;remember start time

   MOV R4,#0                        ;global iteration counter = 0

   LDR R12,iter_max
   SUB R12,R12,#1                   ;R12 = maximum iterations - 1

   VLDR         S1,ru_sp
   VLDR         S0,ro_sp
   VSUB.F32     S0,S0,S1            ;ro - ru
   MOV R0,#fractal_size
   VMOV         S2,R0
   VCVT.F32.U32 S2,S2               ;fractal_size as float
   VDIV.F32     S0,S0,S2
   VSTR         S0,delta_sp         ;delta = (ro-ru)/fractal_size

   VLDR         S0,iter_out_sp
   VDUP.32      Q15,D0[0]           ;bailout limit 4.0 in every lane

   MOV R0,#0
   STR R0,y_counter                 ;start with line 0

   .y_loop_sp
      ;Per-line setup. Computes b for this line (same for all 4 lanes), seeds the
      ;4 lanes with the first four x positions 0..3, resets the lane counters and
      ;works out the screen address of the line.
      ;R2 holds y_counter from here until the MLA below and must not be changed.

      LDR R2,y_counter

      VLDR         S0,io_sp
      VLDR         S1,delta_sp
      VMOV         S2,R2
      VCVT.F32.U32 S2,S2
      VMUL.F32     S2,S2,S1
      VSUB.F32     S0,S0,S2     ;b0 = io - current_y * delta
      VSTR         S0,io_save_sp;save value for easy access later
      VDUP.32      Q5,D0[0]     ;save b0,b1,b2,b3 in Q5

      MOV R6 ,#0                ;starting line counter pixel 1
      MOV R7, #1                ;starting line counter pixel 2
      MOV R8, #2                ;starting line counter pixel 3
      MOV R9, #3                ;starting line counter pixel 4
      MOV R10,R9                ;init line counter global (highest x handed out so far)

      MOV          R0,#0        ;Init all 4 first pixel of a line
      VLDR         S4,ru_sp     ;S4/S5 = D2, used only as source for the VDUPs
      VLDR         S5,delta_sp
      VDUP.32      Q0,D2[0]     ;Q0 =  ru        ru        ru         ru
      VDUP.32      Q1,D2[1]     ;Q1 =  d         d         d          d
      VMOV         S8, R0       ;S8..S11 = lanes of Q2, filled with 0,1,2,3
      ADD          R0, R0,#1
      VMOV         S9, R0
      ADD          R0, R0,#1
      VMOV         S10,R0
      ADD          R0, R0,#1
      VMOV         S11,R0
      VCVT.F32.U32 Q2,Q2        ;Q2 =   x        x+1        x+2         x+3
      VMUL.F32     Q1,Q1,Q2     ;Q1 =  d*x     d*(x+1)    d*(x+2)     d*(x+3)
      VADD.F32     Q0,Q0,Q1     ;Q0 = ru+d*x  ru+d*(x+1) ru+d*(x+2)  ru+d*(x+3)
                                ;x  = a

      VMOV         Q4,Q0        ;save a0,a1,a2,a3 in Q4
      VMOV         Q1,Q5        ;y  = b

      VMOV.I32 Q14,#4           ;init sub value = 4 for each pixel
      VDUP.I32 Q2, R12          ;init single iteration counters = max_iter-1

      LDR R0,Screen_Start       ;Screen Start address
      LDR R1,Hpixels            ;Screen Width
      MOV R1,R1,LSL#2           ;4 Bytes per Pixel
      MLA R5,R1,R2,R0           ;Line position = Screen_Start + HPixels * 4 * y

      MOV R11,#0                ;init end of line flags for each pixel

      .iteration_loop
         ;Four Mandelbrot steps (x,y) -> (x*x - y*y + a, 2*x*y + b) are unrolled
         ;for all 4 lanes before any test is made. x*x + y*y of steps 1,2,3,4 is
         ;kept in Q10,Q11,Q6,Q7 so the diverge handlers can later find the exact
         ;step at which a lane exceeded the limit. Only the 4th step is compared
         ;here; the lane counters in Q2 drop by Q14 once per pass.

         ;1.Iteration
         VMUL.F32       Q3, Q0, Q0   ;x*x
         VMUL.F32       Q8, Q1, Q1   ;y*y
         VMUL.F32       Q12,Q0, Q1   ;x*y
         VADD.F32       Q10,Q3, Q8   ;x*x + y*y (kept in Q10 for the 1st step)
         VSUB.F32       Q9, Q3, Q8   ;x_new = x*x - y*y
         VADD.F32       Q12,Q12,Q12  ;2*x*y
         VADD.F32       Q0, Q9 ,Q4   ;x_new = x_new + a
         VADD.F32       Q1, Q12,Q5   ;y_new = y_new + b

         ;2.Iteration
         VMUL.F32       Q3, Q0, Q0   ;x*x
         VMUL.F32       Q8, Q1, Q1   ;y*y
         VMUL.F32       Q12,Q0, Q1   ;x*y
         VADD.F32       Q11,Q3, Q8   ;x*x + y*y (kept in Q11 for the 2nd step)
         VSUB.F32       Q9, Q3, Q8   ;x_new = x*x - y*y
         VADD.F32       Q12,Q12,Q12  ;2*x*y
         VADD.F32       Q0, Q9 ,Q4   ;x_new = x_new + a
         VADD.F32       Q1, Q12,Q5   ;y_new = y_new + b

         ;3.Iteration
         VMUL.F32       Q3, Q0, Q0   ;x*x
         VMUL.F32       Q8, Q1, Q1   ;y*y
         VMUL.F32       Q12,Q0, Q1   ;x*y
         VADD.F32       Q6, Q3, Q8   ;x*x + y*y (kept in Q6 for the 3rd step)
         VSUB.F32       Q9, Q3, Q8   ;x_new = x*x - y*y
         VADD.F32       Q12,Q12,Q12  ;2*x*y
         VADD.F32       Q0, Q9 ,Q4   ;x_new = x_new + a
         VADD.F32       Q1, Q12,Q5   ;y_new = y_new + b

         ;4.Iteration
         VMUL.F32       Q3, Q0, Q0   ;x*x
         VMUL.F32       Q8, Q1, Q1   ;y*y
         VMUL.F32       Q12,Q0, Q1   ;x*y
         VADD.F32       Q7, Q3, Q8   ;x*x + y*y (kept in Q7 for the 4th step)
         VSUB.F32       Q9, Q3, Q8   ;x_new = x*x - y*y
         VADD.F32       Q12,Q12,Q12  ;2*x*y
         VCGT.F32       Q3, Q7, Q15  ;> 4 ? (lane = all ones if true, else 0)
         VADD.F32       Q0, Q9 ,Q4   ;x_new = x_new + a
         VADD.F32       Q1, Q12,Q5   ;y_new = y_new + b

         VSUB.U32       Q2, Q2, Q14  ;decrement pixel counters

         VORR           D26,D6, D7   ;OR both halves of the compare mask Q3 into D26
         VORR           D27,D4, D5   ;OR both halves of the counters Q2 into D27
         VMOV           R0, R1, D26
         VMOV           R2, R3, D27
         ORRS           R0, R0, R1   ;N set if any lane of the mask is set
         BMI            diverged_1234    ;1 or more pixel diverged
         ORRS           R2, R2, R3   ;N set if any lane counter went negative
      BPL            iteration_loop
      BMI            max_iterations_1234 ;1 or more pixel reached max iter

      ;############ Pixel 1 Diverge Handling ############
      ;
      ;Entered when at least one lane of the 4th-step compare mask (Q3) is set.
      ;If pixel 1 (lane 0) exceeded the limit: find the first of the four
      ;unrolled steps at which it did so, colour the pixel, then either restart
      ;lane 0 on the next free x position or flag end of line for this lane.
      ;In:   Q3 = compare mask, Q2 = lane counters, Q10/Q11/Q6 = x*x+y*y of
      ;      steps 1/2/3, R10 = highest x handed out so far
      ;Out:  R4, R10, R11 updated; lane 0 of Q2 reset; lane 0 of Q0,Q1,Q4 and
      ;      R6 reloaded when the lane is restarted
      ;Uses: R0-R2, Q13 (scratch). Q3 is preserved.
      ;The mask tests use CMP #0: a mask lane is either 0 or all ones, so this
      ;gives the same Z flag as testing against &ffffffff, which cannot be
      ;encoded as an ARM immediate.

      .diverged_1234
      VMOV  R0,S12               ;check if pixel 1 diverged (Q3.1 -> S12)
      CMP   R0,#0
      BEQ   diverged_234         ;not ? -> check Pixel 2,3,4

         VMOV     Q13,Q3         ;save 4th iteration compare results
         VMOV     R0,S8          ;get iterations pixel 1 (Q2.1 -> S8)
         VCGT.F32 Q3, Q6, Q15    ;> 4 ? Diverged at 3rd ?
         VMOV     R1,S12
         CMP      R1,#0
         ADDNE    R0,R0,#1       ;one step less was needed
         VCGT.F32 Q3, Q11,Q15    ;> 4 ? Diverged at 2nd ?
         VMOV     R1,S12
         CMP      R1,#0
         ADDNE    R0,R0,#1
         VCGT.F32 Q3, Q10,Q15    ;> 4 ? Diverged at 1st ?
         VMOV     R1,S12
         CMP      R1,#0
         ADDNE    R0,R0,#1
         VMOV     Q3,Q13         ;restore 4th iteration compare results

         SUB   R2,R12,R0             ;correct counter -> iterations used
         ADD   R4,R4, R2             ;add to global iteration counter
         MOV   R2,R2,LSL#11          ;some colour
         STR   R2,[R5,R6,LSL#2]      ;write pixel on screen
         VMOV  S8,R12                ;reset counter (Q2.1 -> S8) on both paths, so a
                                     ;finished pixel whose counter just went negative
                                     ;is not counted again by the max iter handler
         ADD   R10,R10,#1            ;increase x
         CMP   R10,#fractal_size     ;end of line reached ?

            ORRGE R11,R11,#%0001     ;set end of line flags for pixel 1
            BGE diverged_234         ;but still check if also another diverged

         MOV           R6,R10        ;save new x in new plot_x1
         VLDR          S0,delta_sp   ;use S0 as not needed anymore
         VMOV          S4,R10        ;use S4 for R10, as not needed anymore
         VCVT.F32.U32  S4,S4
         VMUL.F32      S0,S0,S4      ;new x1 = x1*d
         VLDR          S4,ru_sp      ;use S4 as not needed anymore
         VADD.F32      S0,S0,S4      ;new x1 = ru + x1*d
         VMOV.F32      S16,S0        ;new x1 also = new 'a' (Q4.1 -> S16)
         VLDR          S4,io_save_sp ;y1 = b of this line (Q1.1 -> S4)

         ;just continue checking pixel 2

      ;############ Pixel 2 Diverge Handling ############
      ;
      ;If pixel 2 (lane 1) exceeded the limit: find the first of the four
      ;unrolled steps at which it did so, colour the pixel, then either restart
      ;lane 1 on the next free x position or flag end of line for this lane.
      ;In:   Q3 = compare mask, Q2 = lane counters, Q10/Q11/Q6 = x*x+y*y of
      ;      steps 1/2/3, R10 = highest x handed out so far
      ;Out:  R4, R10, R11 updated; lane 1 of Q2 reset; lane 1 of Q0,Q1,Q4 and
      ;      R7 reloaded when the lane is restarted
      ;Uses: R0-R2, Q13 (scratch). Q3 is preserved.
      ;The mask tests use CMP #0: a mask lane is either 0 or all ones, so this
      ;gives the same Z flag as testing against &ffffffff, which cannot be
      ;encoded as an ARM immediate.

      .diverged_234
      VMOV R0,S13                ;check if pixel 2 diverged (Q3.2 -> S13)
      CMP  R0,#0
      BEQ  diverged_34           ;not ? -> check Pixel 3,4

         VMOV     Q13,Q3         ;save 4th iteration compare results
         VMOV     R0,S9          ;get iterations pixel 2 (Q2.2 -> S9)
         VCGT.F32 Q3, Q6, Q15    ;> 4 ? Diverged at 3rd ?
         VMOV     R1,S13
         CMP      R1,#0
         ADDNE    R0,R0,#1       ;one step less was needed
         VCGT.F32 Q3, Q11,Q15    ;> 4 ? Diverged at 2nd ?
         VMOV     R1,S13
         CMP      R1,#0
         ADDNE    R0,R0,#1
         VCGT.F32 Q3, Q10,Q15    ;> 4 ? Diverged at 1st ?
         VMOV     R1,S13
         CMP      R1,#0
         ADDNE    R0,R0,#1
         VMOV     Q3,Q13         ;restore 4th iteration compare results

         SUB   R2,R12,R0             ;correct counter -> iterations used
         ADD   R4,R4,R2              ;add to global iteration counter
         MOV   R2,R2,LSL#11          ;some colour
         STR   R2,[R5,R7,LSL#2]      ;write pixel on screen
         VMOV  S9,R12                ;reset counter (Q2.2 -> S9) on both paths, so a
                                     ;finished pixel whose counter just went negative
                                     ;is not counted again by the max iter handler
         ADD   R10,R10,#1            ;increase x
         CMP   R10,#fractal_size     ;end of line reached ?

            ORRGE R11,R11,#%0010     ;set end of line flags for pixel 2
            BGE diverged_34          ;but still check if also another diverged

         MOV           R7,R10        ;save new x in new plot_x2
         VLDR          S1,delta_sp   ;use S1 as not needed anymore
         VMOV          S5,R10        ;use S5 for R10, as not needed anymore
         VCVT.F32.U32  S5,S5
         VMUL.F32      S1,S1,S5      ;new x2 = x2*d
         VLDR          S5,ru_sp      ;use S5 as not needed anymore
         VADD.F32      S1,S1,S5      ;new x2 = ru + x2*d
         VMOV.F32      S17,S1        ;new x2 also = new 'a' (Q4.2 -> S17)
         VLDR          S5,io_save_sp ;y2 = b of this line (Q1.2 -> S5)

         ;just continue checking pixel 3

      ;############ Pixel 3 Diverge Handling ############
      ;
      ;If pixel 3 (lane 2) exceeded the limit: find the first of the four
      ;unrolled steps at which it did so, colour the pixel, then either restart
      ;lane 2 on the next free x position or flag end of line for this lane.
      ;In:   Q3 = compare mask, Q2 = lane counters, Q10/Q11/Q6 = x*x+y*y of
      ;      steps 1/2/3, R10 = highest x handed out so far
      ;Out:  R4, R10, R11 updated; lane 2 of Q2 reset; lane 2 of Q0,Q1,Q4 and
      ;      R8 reloaded when the lane is restarted
      ;Uses: R0-R2, Q13 (scratch). Q3 is preserved.
      ;The mask tests use CMP #0: a mask lane is either 0 or all ones, so this
      ;gives the same Z flag as testing against &ffffffff, which cannot be
      ;encoded as an ARM immediate.

      .diverged_34
      VMOV R0,S14                ;check if pixel 3 diverged (Q3.3 -> S14)
      CMP  R0,#0
      BEQ  diverged_4            ;not ? -> check Pixel 4

         VMOV     Q13,Q3         ;save 4th iteration compare results
         VMOV     R0,S10         ;get iterations pixel 3 (Q2.3 -> S10)
         VCGT.F32 Q3, Q6, Q15    ;> 4 ? Diverged at 3rd ?
         VMOV     R1,S14
         CMP      R1,#0
         ADDNE    R0,R0,#1       ;one step less was needed
         VCGT.F32 Q3, Q11,Q15    ;> 4 ? Diverged at 2nd ?
         VMOV     R1,S14
         CMP      R1,#0
         ADDNE    R0,R0,#1
         VCGT.F32 Q3, Q10,Q15    ;> 4 ? Diverged at 1st ?
         VMOV     R1,S14
         CMP      R1,#0
         ADDNE    R0,R0,#1
         VMOV     Q3,Q13         ;restore 4th iteration compare results

         SUB   R2,R12,R0             ;correct counter -> iterations used
         ADD   R4,R4, R2             ;add to global iteration counter
         MOV   R2,R2,LSL#11          ;some colour
         STR   R2,[R5,R8,LSL#2]      ;write pixel on screen
         VMOV  S10,R12               ;reset counter (Q2.3 -> S10) on both paths, so a
                                     ;finished pixel whose counter just went negative
                                     ;is not counted again by the max iter handler
         ADD   R10,R10,#1            ;increase x
         CMP   R10,#fractal_size     ;end of line reached ?

            ORRGE R11,R11,#%0100     ;set end of line flags for pixel 3
            BGE diverged_4           ;but still check if also another diverged

         MOV           R8, R10       ;save new x in new plot_x3
         VLDR          S2,delta_sp   ;use S2 as not needed anymore
         VMOV          S6,R10        ;use S6 for R10, as not needed anymore
         VCVT.F32.U32  S6,S6
         VMUL.F32      S2,S2,S6      ;new x3 = x3*d
         VLDR          S6,ru_sp      ;use S6 as not needed anymore
         VADD.F32      S2,S2,S6      ;new x3 = ru + x3*d
         VMOV.F32      S18,S2        ;new x3 also = new 'a' (Q4.3 -> S18)
         VLDR          S6,io_save_sp ;y3 = b of this line (Q1.3 -> S6)

         ;just continue checking pixel 4

      B diverged_4               ;jump over the data words that follow

      ;############ Data Mandelbrot Fractal placed here for offset limit reason
      ;Single precision constants and work variables, read and written with
      ;VLDR/VSTR from the code before and after this point. The code path
      ;branches around them.
      .ro_sp             DCFS     -0.1450  ;ro, used as (ro-ru)/fractal_size = delta
      .ru_sp             DCFS     -0.1750  ;ru, 'a' of pixel x = ru + x*delta
      .io_sp             DCFS     -1.02    ;io, 'b' of line y = io - y*delta
      .delta_sp          DCFS      0.0     ;dummy_value, written once during setup
      .io_save_sp        DCFS      0.0     ;dummy_value, 'b' of the current line
      .iter_out_sp       DCFS      4.0     ;bailout limit for x*x + y*y

      ;############ Pixel 4 Diverge Handling ############
      ;
      ;If pixel 4 (lane 3) exceeded the limit: find the first of the four
      ;unrolled steps at which it did so, colour the pixel, then either restart
      ;lane 3 on the next free x position or flag end of line for this lane.
      ;In:   Q3 = compare mask, Q2 = lane counters, Q10/Q11/Q6 = x*x+y*y of
      ;      steps 1/2/3, R10 = highest x handed out so far
      ;Out:  R4, R10, R11 updated; lane 3 of Q2 reset; lane 3 of Q0,Q1,Q4 and
      ;      R9 reloaded when the lane is restarted
      ;Uses: R0-R2, Q13 (scratch). Q3 is preserved.
      ;The mask tests use CMP #0: a mask lane is either 0 or all ones, so this
      ;gives the same Z flag as testing against &ffffffff, which cannot be
      ;encoded as an ARM immediate.

      .diverged_4
      VMOV  R0,S15               ;check if pixel 4 diverged (Q3.4 -> S15)
      CMP   R0,#0
      BEQ   end_diverge_check    ;not ? -> continue

         VMOV     Q13,Q3         ;save 4th iteration compare results
         VMOV     R0,S11         ;get iterations pixel 4 (Q2.4 -> S11)
         VCGT.F32 Q3, Q6, Q15    ;> 4 ? Diverged at 3rd ?
         VMOV     R1,S15
         CMP      R1,#0
         ADDNE    R0,R0,#1       ;one step less was needed
         VCGT.F32 Q3, Q11,Q15    ;> 4 ? Diverged at 2nd ?
         VMOV     R1,S15
         CMP      R1,#0
         ADDNE    R0,R0,#1
         VCGT.F32 Q3, Q10,Q15    ;> 4 ? Diverged at 1st ?
         VMOV     R1,S15
         CMP      R1,#0
         ADDNE    R0,R0,#1
         VMOV     Q3,Q13         ;restore 4th iteration compare results

         SUB   R2,R12,R0             ;correct counter -> iterations used
         ADD   R4,R4, R2             ;add to global iteration counter
         MOV   R2,R2,LSL#11          ;some colour
         STR   R2,[R5,R9,LSL#2]      ;write pixel on screen
         VMOV  S11,R12               ;reset counter (Q2.4 -> S11) on both paths, so a
                                     ;finished pixel whose counter just went negative
                                     ;is not counted again by the max iter handler
         ADD   R10,R10,#1            ;increase x
         CMP   R10,#fractal_size     ;end of line reached ?

            ORRGE R11,R11,#%1000     ;set end of line flags for pixel 4
            BGE end_diverge_check    ;but still check also end of iterations

         MOV           R9, R10       ;save new x in new plot_x4
         VLDR          S3,delta_sp   ;use S3 as not needed anymore
         VMOV          S7,R10        ;use S7 for R10, as not needed anymore
         VCVT.F32.U32  S7,S7
         VMUL.F32      S3,S3,S7      ;new x4 = x4*d
         VLDR          S7,ru_sp      ;use S7 as not needed anymore
         VADD.F32      S3,S3,S7      ;new x4 = ru + x4*d
         VMOV.F32      S19,S3        ;new x4 also = new 'a' (Q4.4 -> S19)
         VLDR          S7,io_save_sp ;y4 = b of this line (Q1.4 -> S7)

      .end_diverge_check
      ;All four lanes have been checked for divergence. Now see whether any lane
      ;counter went negative; if not, resume the 4-lane loop unless a lane has
      ;flagged end of line. With an end of line flag set, execution falls
      ;through into the max iteration checks below.

      VORR D27,D4, D5           ;OR both halves of the counters Q2 into D27
      VMOV R2, R3, D27
      ORRS R2, R2, R3           ;N set if any lane counter is negative
      BMI max_iterations_1234   ;1 or more pixel reached also max iter

      CMP R11,#0
      BEQ iteration_loop        ;no end of line flag set -> keep iterating

      ;############ Pixel 1 Max Iterations Handling ############
      ;If the counter of pixel 1 (lane 0) is negative, the pixel used up all
      ;iterations: add R12+1 to the global counter and restart lane 0 on the next
      ;free x position, or flag end of line. No pixel is written in this case.

      .max_iterations_1234
      VMOV  R0,S8               ;check if pixel 1 reach max iter (Q2.1 -> S8)
      TST   R0,#&80000000       ;check sign bit
      BEQ max_iterations_234    ;not ? -> check Pixel 2,3,4

         ADD   R4,R4,R12
         ADD   R4,R4,#1              ;add to global iteration counter
         ADD   R10,R10,#1            ;increase x
         CMP   R10,#fractal_size     ;end of line reached ?

            ORRGE R11,R11,#%0001     ;set end of line flags for pixel 1
            BGE max_iterations_234   ;but still check also others

         VMOV          S8,R12        ;transfer to Q2.1 -> S8
         MOV           R6,R10        ;save new x in new plot_x1
         VLDR          S0,delta_sp   ;use S0 as not needed anymore
         VMOV          S4,R10        ;use S4 for R10, as not needed anymore
         VCVT.F32.U32  S4,S4
         VMUL.F32      S0,S0,S4      ;new x1 = x1*d
         VLDR          S4,ru_sp      ;use S4 as not needed anymore
         VADD.F32      S0,S0,S4      ;new x1 = ru + x1*d
         VMOV.F32      S16,S0        ;new x1 also = new 'a' (Q4.1 -> S16)
         VLDR          S4,io_save_sp ;y1 = b of this line (Q1.1 -> S4)

      ;############ Pixel 2 Max Iterations Handling ############
      ;If the counter of pixel 2 (lane 1) is negative, the pixel used up all
      ;iterations: add R12+1 to the global counter and restart lane 1 on the next
      ;free x position, or flag end of line. No pixel is written in this case.

      .max_iterations_234
      VMOV  R0,S9               ;check if pixel 2 reach max iter (Q2.2 -> S9)
      TST   R0,#&80000000       ;check sign bit
      BEQ   max_iterations_34   ;not ? -> check Pixel 3,4

         ADD   R4,R4,R12
         ADD   R4,R4,#1              ;add to global iteration counter
         ADD   R10,R10,#1            ;increase x
         CMP   R10,#fractal_size     ;end of line reached ?

            ORRGE R11,R11,#%0010     ;set end of line flags for pixel 2
            BGE max_iterations_34    ;but still check also others

         VMOV          S9,R12        ;transfer to Q2.2 -> S9
         MOV           R7,R10        ;save new x in new plot_x2
         VLDR          S1,delta_sp   ;use S1 as not needed anymore
         VMOV          S5,R10        ;use S5 for R10, as not needed anymore
         VCVT.F32.U32  S5,S5
         VMUL.F32      S1,S1,S5      ;new x2 = x2*d
         VLDR          S5,ru_sp      ;use S5 as not needed anymore
         VADD.F32      S1,S1,S5      ;new x2 = ru + x2*d
         VMOV.F32      S17,S1        ;new x2 also = new 'a' (Q4.2 -> S17)
         VLDR          S5,io_save_sp ;y2 = b of this line (Q1.2 -> S5)

      ;############ Pixel 3 Max Iterations Handling ############
      ;If the counter of pixel 3 (lane 2) is negative, the pixel used up all
      ;iterations: add R12+1 to the global counter and restart lane 2 on the next
      ;free x position, or flag end of line. No pixel is written in this case.

      .max_iterations_34
      VMOV  R0,S10              ;check if pixel 3 reach max iter (Q2.3 -> S10)
      TST   R0,#&80000000       ;check sign bit
      BEQ max_iterations_4      ;not ? -> check Pixel 4

         ADD   R4,R4,R12
         ADD   R4,R4,#1              ;add to global iteration counter
         ADD   R10,R10,#1            ;increase x
         CMP   R10,#fractal_size     ;end of line reached ?

            ORRGE R11,R11,#%0100     ;set end of line flags for pixel 3
            BGE max_iterations_4     ;but still check also others

         VMOV          S10,R12       ;transfer to Q2.3 -> S10
         MOV           R8,R10        ;save new x in new plot_x3
         VLDR          S2,delta_sp   ;use S2 as not needed anymore
         VMOV          S6,R10        ;use S6 for R10, as not needed anymore
         VCVT.F32.U32  S6,S6
         VMUL.F32      S2,S2,S6      ;new x3 = x3*d
         VLDR          S6,ru_sp      ;use S6 as not needed anymore
         VADD.F32      S2,S2,S6      ;new x3 = ru + x3*d
         VMOV.F32      S18,S2        ;new x3 also = new 'a' (Q4.3 -> S18)
         VLDR          S6,io_save_sp ;y3 = b of this line (Q1.3 -> S6)

      ;############ Pixel 4 Max Iterations Handling ############
      ;If the counter of pixel 4 (lane 3) is negative, the pixel used up all
      ;iterations: add R12+1 to the global counter and restart lane 3 on the next
      ;free x position, or go straight to the end of line handling. No pixel is
      ;written in this case.

      .max_iterations_4
      VMOV  R0,S11              ;check if pixel 4 reach max iter (Q2.4 -> S11)
      TST   R0,#&80000000       ;check sign bit
      BEQ end_max_iter_check    ;not ? -> end max iter check

         ADD   R4,R4,R12
         ADD   R4,R4,#1              ;add to global iteration counter
         ADD   R10,R10,#1            ;increase x
         CMP   R10,#fractal_size     ;end of line reached ?

            ORRGE R11,R11,#%1000     ;set end of line flags for pixel 4
            BGE end_of_line          ;now all are checked for everything

         VMOV          S11,R12       ;transfer to Q2.4 -> S11
         MOV           R9,R10        ;save new x in new plot_x4
         VLDR          S3,delta_sp   ;use S3 as not needed anymore
         VMOV          S7,R10        ;use S7 for R10, as not needed anymore
         VCVT.F32.U32  S7,S7
         VMUL.F32      S3,S3,S7      ;new x4 = x4*d
         VLDR          S7,ru_sp      ;use S7 as not needed anymore
         VADD.F32      S3,S3,S7      ;new x4 = ru + x4*d
         VMOV.F32      S19,S3        ;new x4 also = new 'a' (Q4.4 -> S19)
         VLDR          S7,io_save_sp ;y4 = b of this line (Q1.4 -> S7)

      .end_max_iter_check
      ;Resume the 4-lane loop while no lane has flagged end of line.

      CMP R11,#0
      BEQ iteration_loop

      .end_of_line
      ;No x positions are left to hand out. Lanes without an end of line flag
      ;still hold an unfinished pixel; each is finished below one step at a
      ;time. Q10, Q11 and Q13 are reused from here on as a snapshot of x, y and
      ;the counters, restored before each of the remaining lanes is processed.

      VMOV.I32 Q14,#1 ;change sub value = 1 now for each pixel
      VMOV Q10,Q0     ;save state of Q0 for other pixels
      VMOV Q11,Q1     ;save state of Q1 for other pixels
      VMOV Q13,Q2     ;save state of pixel counter for other pixels

      ;############ End of Line Pixel 1 Handling  ############
      ;Skipped when pixel 1 already finished (flag bit 0 set). Otherwise iterate
      ;one step at a time (all lanes are computed, only lane 0 is tested) until
      ;lane 0 exceeds the limit or its counter goes negative.
      TST R11,#%0001
      BNE check_end_of_line_pixel_2

      .iterate_pixel_1
         VMUL.F32       Q3, Q0, Q0   ;x*x
         VMUL.F32       Q8, Q1, Q1   ;y*y
         VMUL.F32       Q12,Q0, Q1   ;x*y
         VADD.F32       Q7, Q3, Q8   ;x*x + y*y
         VSUB.F32       Q9, Q3, Q8   ;x_new = x*x - y*y
         VADD.F32       Q12,Q12,Q12  ;2*x*y
         VCGT.F32       Q3, Q7, Q15  ;> 4 ?
         VADD.F32       Q0, Q9 ,Q4   ;x_new = x_new + a
         VADD.F32       Q1, Q12,Q5   ;y_new = y_new + b
         VSUB.U32       Q2, Q2, Q14  ;decrement pixel counters (by 1 here)

         VMOV           R0,S12       ;check if pixel 1 diverged
         VMOV           R1,S8        ;check if pixel 1 reached max iter
         MOVS           R0,R0        ;N set if the mask lane is all ones
         BMI            end_pixel_1_diverged
         MOVS           R1,R1        ;N set if the counter went negative
         BPL            iterate_pixel_1

         ;maximum iterations reached (no pixel is written)
         ADD   R4,R4,R12             ;add to global iteration counter
         ADD   R4,R4,#1
         B check_end_of_line_pixel_2

      .end_pixel_1_diverged
         VMOV  R0,S8                 ;get iterations pixel 1 (Q2.1 -> S8)
         SUB   R2,R12,R0             ;correct counter -> iterations used
         ADD   R4,R4, R2             ;add to global iteration counter
         MOV   R2,R2,LSL#11          ;some colour
         STR   R2,[R5,R6,LSL#2]      ;write pixel on screen

      ;############ End of Line Pixel 2 Handling  ############
      ;Restore the snapshot taken at end_of_line, then, unless pixel 2 already
      ;finished (flag bit 1 set), iterate one step at a time until lane 1
      ;exceeds the limit or its counter goes negative.
      .check_end_of_line_pixel_2

      VMOV Q0,Q10 ;restore state of Q0 for other pixels
      VMOV Q1,Q11 ;restore state of Q1 for other pixels
      VMOV Q2,Q13 ;restore state of pixel counter for other pixels

      TST R11,#%0010
      BNE check_end_of_line_pixel_3

      .iterate_pixel_2
         VMUL.F32       Q3, Q0, Q0   ;x*x
         VMUL.F32       Q8, Q1, Q1   ;y*y
         VMUL.F32       Q12,Q0, Q1   ;x*y
         VADD.F32       Q7, Q3, Q8   ;x*x + y*y
         VSUB.F32       Q9, Q3, Q8   ;x_new = x*x - y*y
         VADD.F32       Q12,Q12,Q12  ;2*x*y
         VCGT.F32       Q3, Q7, Q15  ;> 4 ?
         VADD.F32       Q0, Q9 ,Q4   ;x_new = x_new + a
         VADD.F32       Q1, Q12,Q5   ;y_new = y_new + b
         VSUB.U32       Q2, Q2, Q14  ;decrement pixel counters (by 1 here)

         VMOV           R0,S13       ;check if pixel 2 diverged
         VMOV           R1,S9        ;check if pixel 2 reached max iter
         MOVS           R0,R0        ;N set if the mask lane is all ones
         BMI            end_pixel_2_diverged
         MOVS           R1,R1        ;N set if the counter went negative
         BPL            iterate_pixel_2

         ;maximum iterations reached (no pixel is written)
         ADD   R4,R4,R12             ;add to global iteration counter
         ADD   R4,R4,#1
         B check_end_of_line_pixel_3

      .end_pixel_2_diverged
         VMOV  R0,S9                 ;get iterations pixel 2 (Q2.2 -> S9)
         SUB   R2,R12,R0             ;correct counter -> iterations used
         ADD   R4,R4, R2             ;add to global iteration counter
         MOV   R2,R2,LSL#11          ;some colour
         STR   R2,[R5,R7,LSL#2]      ;write pixel on screen

      ;############ End of Line Pixel 3 Handling  ############
      ;Restore the snapshot taken at end_of_line, then, unless pixel 3 already
      ;finished (flag bit 2 set), iterate one step at a time until lane 2
      ;exceeds the limit or its counter goes negative.
      .check_end_of_line_pixel_3

      VMOV Q0,Q10 ;restore state of Q0 for other pixels
      VMOV Q1,Q11 ;restore state of Q1 for other pixels
      VMOV Q2,Q13 ;restore state of pixel counter for other pixels

      TST R11,#%0100
      BNE check_end_of_line_pixel_4

      .iterate_pixel_3
         VMUL.F32       Q3, Q0, Q0   ;x*x
         VMUL.F32       Q8, Q1, Q1   ;y*y
         VMUL.F32       Q12,Q0, Q1   ;x*y
         VADD.F32       Q7, Q3, Q8   ;x*x + y*y
         VSUB.F32       Q9, Q3, Q8   ;x_new = x*x - y*y
         VADD.F32       Q12,Q12,Q12  ;2*x*y
         VCGT.F32       Q3, Q7, Q15  ;> 4 ?
         VADD.F32       Q0, Q9 ,Q4   ;x_new = x_new + a
         VADD.F32       Q1, Q12,Q5   ;y_new = y_new + b
         VSUB.U32       Q2, Q2, Q14  ;decrement pixel counters (by 1 here)

         VMOV           R0,S14       ;check if pixel 3 diverged
         VMOV           R1,S10       ;check if pixel 3 reached max iter
         MOVS           R0,R0        ;N set if the mask lane is all ones
         BMI            end_pixel_3_diverged
         MOVS           R1,R1        ;N set if the counter went negative
         BPL            iterate_pixel_3

         ;maximum iterations reached (no pixel is written)
         ADD   R4,R4,R12             ;add to global iteration counter
         ADD   R4,R4,#1
         B check_end_of_line_pixel_4

      .end_pixel_3_diverged
         VMOV  R0,S10                ;get iterations pixel 3 (Q2.3 -> S10)
         SUB   R2,R12,R0             ;correct counter -> iterations used
         ADD   R4,R4, R2             ;add to global iteration counter
         MOV   R2,R2,LSL#11          ;some colour
         STR   R2,[R5,R8,LSL#2]      ;write pixel on screen

      ;############ End of Line Pixel 4 Handling  ############
      ;Restore the snapshot taken at end_of_line, then, unless pixel 4 already
      ;finished (flag bit 3 set), iterate one step at a time until lane 3
      ;exceeds the limit or its counter goes negative.
      .check_end_of_line_pixel_4

      VMOV Q0,Q10 ;restore state of Q0 for other pixels
      VMOV Q1,Q11 ;restore state of Q1 for other pixels
      VMOV Q2,Q13 ;restore state of pixel counter for other pixels

      TST R11,#%1000
      BNE next_line

      .iterate_pixel_4
         VMUL.F32       Q3, Q0, Q0   ;x*x
         VMUL.F32       Q8, Q1, Q1   ;y*y
         VMUL.F32       Q12,Q0, Q1   ;x*y
         VADD.F32       Q7, Q3, Q8   ;x*x + y*y
         VSUB.F32       Q9, Q3, Q8   ;x_new = x*x - y*y
         VADD.F32       Q12,Q12,Q12  ;2*x*y
         VCGT.F32       Q3, Q7, Q15  ;> 4 ?
         VADD.F32       Q0, Q9 ,Q4   ;x_new = x_new + a
         VADD.F32       Q1, Q12,Q5   ;y_new = y_new + b
         VSUB.U32       Q2, Q2, Q14  ;decrement pixel counters (by 1 here)

         VMOV           R0,S15       ;check if pixel 4 diverged
         VMOV           R1,S11       ;check if pixel 4 reached max iter
         MOVS           R0,R0        ;N set if the mask lane is all ones
         BMI            end_pixel_4_diverged
         MOVS           R1,R1        ;N set if the counter went negative
         BPL            iterate_pixel_4

         ;maximum iterations reached (no pixel is written)
         ADD   R4,R4,R12             ;add to global iteration counter
         ADD   R4,R4,#1
         B next_line

      .end_pixel_4_diverged
         VMOV  R0,S11                ;get iterations pixel 4 (Q2.4 -> S11)
         SUB   R2,R12,R0             ;correct counter -> iterations used
         ADD   R4,R4, R2             ;add to global iteration counter
         MOV   R2,R2,LSL#11          ;some colour
         STR   R2,[R5,R9,LSL#2]      ;write pixel on screen

   .next_line
   ;Advance y_counter and loop until fractal_size lines are done, then store
   ;the iteration total and replace the saved start time by the elapsed time.
   LDR R0,y_counter
   ADD R0,R0,#1
   STR R0,y_counter
   CMP R0,#fractal_size

   BNE y_loop_sp

   STR R4,iteration_counter          ;save iteration counter in case R4 used

   SWI OS_ReadMonotonicTime
   LDR R1,timer
   SUB R0,R0,R1                      ;elapsed = now - start
   STR R0,timer

   ;---------End Single NEON Precision

   B display_results                 ;jump over the data words that follow

   ;Constants and work variables of the benchmark.
   ;NOTE(review): the 0.1 factor gives 1000 iterations/s only if the timer
   ;counts centiseconds (iterations/cs * 100 cs/s / 1000) - confirm.
   .result_multiplier        DCFS    0.1 ;iterations per timer tick -> 1000 iterations per second
   .iter_max                 DCD   4096  ;maximum iterations per pixel
   .iteration_counter        DCD      0  ;total iterations, stored after the last line
   .timer                    DCD      0  ;start time while running, elapsed time afterwards
   .y_counter                DCD      0  ;current line

   ;---------Display Results
   ;Print the elapsed time, the number of iterations and the resulting speed.
   ;The timer value comes from OS_ReadMonotonicTime and is in centiseconds, so
   ;it is multiplied by 10 for the "[ms]" line. The speed calculation keeps
   ;using centiseconds, which is what result_multiplier (0.1) is scaled for.
   ;NOTE(review): Home, Print and Print1 are macros defined elsewhere; as in
   ;the original code, S2 and R0 are assumed to survive them - confirm.
   .display_results

   VLDR         S2,result_multiplier
   Home

   Print "Result Single Precision NEON            ":NL
   Print "----------------------------------------":NL
   LDR          R9,timer
   ADD          R9,R9,R9,LSL#2       ;centiseconds * 5
   MOV          R9,R9,LSL#1          ;centiseconds * 10 = milliseconds
   Print1 "TIME taken [ms]              = ",R9:NL
   LDR          R0,iteration_counter
   Print1 "Iterations calculated        = ",R0:NL
   LDR          R9,timer             ;elapsed time in centiseconds again
   CMP          R9,#0
   MOVEQ        R9,#1                ;run shorter than one tick: avoid division by zero
   VMOV         S0,R0
   VMOV         S1,R9
   VCVT.F32.U32 S0,S0
   VCVT.F32.U32 S1,S1
   VDIV.F32     S0,S0,S1             ;iterations per centisecond
   VMUL.F32     S0,S0,S2             ;* 0.1 -> 1000 iterations per second
   VCVT.U32.F32 S0,S0
   VMOV         R0,S0
   Print1 "Speed in [1000 Iterations/s] = ",R0:NL:NL

.Check_Escape_Loop
   ;Busy-wait until the word at Escape_Flag reads 1, then shut down and exit.
   ;NOTE(review): Escape_Flag and Shut_Down are defined outside this section;
   ;the flag is presumably set by an escape handler installed by Init.
   LDR      R0,Escape_Flag
   CMP      R0,#1
   BNE      Check_Escape_Loop
   BL       Shut_Down
   SWI      OS_Exit

