
   ADR      R13,Stack
   Print "<4>"
   BL       Init
   GcolBackground black
   Cls
   BL       Set_Draw_Bank
   Cls

   ;---------VFP Init code from Terje
   SWI      OS_EnterOS
   MRC      P15,0,R0,C1,C0,2
   ORR      R0,R0,#&F << 20
   MCR      P15,0,R0,C1,C0,2
   ISB      SY
   MOV      R0,#1 << 30
   VMSR     FPEXC,R0
   MSR      CPSR_c,#&10


   ;---------Calculate Mandelbrot Fractal in Single Precision

   SWI      OS_ReadMonotonicTime
   MOV      R9,R0

   VLDR         S10,ro_sp
   VLDR         S11,ru_sp
   LDR          R0,fractal_size
   VMOV         S12,R0
   VCVT.F32.U32 S12,S12
   VSUB.F32     S10,S10,S11
   VDIV.F32     S31,S10,S12        ;delta = (ro-ru)/fractal_size

   LDR R0,Screen_Start
   LDR R1,Hpixels       ;Screen_Width

   VLDR S7, iter_out_sp ;limit = 4.0
   LDR  R2, iter_max    ;maximum iterations

   LDR R11,fractal_size
   SUB R11,R11,#1
   .y_loop_sp
      LDR R10,fractal_size
      SUB R10,R10,#1

      VLDR         S1, io_sp
      VMOV         S11,R11
      VCVT.F32.U32 S11,S11
      VMUL.F32     S2, S11,S31
      VSUB.F32     S11,S1, S2      ;b = io - current_y * delta

      .x_loop_sp
      VLDR         S1, ru_sp
      VMOV         S10,R10
      VCVT.F32.U32 S10,S10
      VMUL.F32     S2, S10,S31
      VADD.F32     S10,S1, S2      ;a = ru + current_x * delta

      VMOV.F32     S0, S10         ;x = a
      VMOV.F32     S1, S11         ;y = b

      MOV  R12,#0
      .iteration_loop_sp
         ADD R12,R12,#4              ; increment iteration counter
         ; 1.Iteration
         VMUL.F32       S6, S0, S0   ; x*x
         VMUL.F32       S9, S1, S1   ; y*y
         VMUL.F32       S2, S0, S1   ; x*y
         VADD.F32       S14,S6, S9   ; x*x + y*y (save diverge state in S14)
         VSUB.F32       S13,S6, S9   ; x_new = x*x - y*y
         VADD.F32       S12,S2, S2   ; 2*x*y
         VADD.F32       S0, S13,S10  ; x_new = x_new + a
         VADD.F32       S1, S12,S11  ; y_new = y_new + b
         ; 2.Iteration
         VMUL.F32       S6, S0, S0   ; x*x
         VMUL.F32       S9, S1, S1   ; y*y
         VMUL.F32       S2, S0, S1   ; x*y
         VADD.F32       S15,S6, S9   ; x*x + y*y (save diverge state in S15)
         VSUB.F32       S13,S6, S9   ; x_new = x*x - y*y
         VADD.F32       S12,S2, S2   ; 2*x*y
         VADD.F32       S0, S13,S10  ; x_new = x_new + a
         VADD.F32       S1, S12,S11  ; y_new = y_new + b
         ; 3.Iteration
         VMUL.F32       S6, S0, S0   ; x*x
         VMUL.F32       S9, S1, S1   ; y*y
         VMUL.F32       S2, S0, S1   ; x*y
         VADD.F32       S16,S6, S9   ; x*x + y*y (save diverge state in S16)
         VSUB.F32       S13,S6, S9   ; x_new = x*x - y*y
         VADD.F32       S12,S2, S2   ; 2*x*y
         VADD.F32       S0, S13,S10  ; x_new = x_new + a
         VADD.F32       S1, S12,S11  ; y_new = y_new + b
         ; 4.Iteration
         VMUL.F32       S6, S0, S0   ; x*x
         VMUL.F32       S9, S1, S1   ; y*y
         VMUL.F32       S2, S0, S1   ; x*y
         VADD.F32       S4, S6, S9   ; x*x + y*y
         VSUB.F32       S13,S6, S9   ; x_new = x*x - y*y
         VADD.F32       S12,S2, S2   ; 2*x*y
         VADD.F32       S0, S13,S10  ; x_new = x_new + a
         VADD.F32       S1, S12,S11  ; y_new = y_new + b

         VCMP.F32       S4,S7
         VMRS           APSR_nzcv,FPSCR
         BGT            exit_sp

      CMP R12,R2
      BNE iteration_loop_sp
      B draw_pixel_sp

      .exit_sp
      VCMP.F32       S16,S7     ; check if diverge happend at 3.Iteration
      VMRS           APSR_nzcv,FPSCR
      SUBGT          R12,R12,#1 ; if yes, subtract one iteration
      VCMP.F32       S15,S7     ; check if diverge happend at 2.Iteration
      VMRS           APSR_nzcv,FPSCR
      SUBGT          R12,R12,#1 ; if yes, subtract one iteration
      VCMP.F32       S14,S7     ; check if diverge happend at 1.Iteration
      VMRS           APSR_nzcv,FPSCR
      SUBGT          R12,R12,#1 ; if yes, subtract one iteration

      .draw_pixel_sp

      ;save iteration_counter first
      LDR   R4,iteration_counter_sp
      ADD   R4,R4,R12
      STR   R4,iteration_counter_sp

      MLA   R4,R1,R11,R10
      MOV   R12,R12,LSL#11    ;some simple colour adjustments
      STR   R12,[R0,R4,LSL#2] ;screen address in 32bit mode = (y*Hpixels+y)*4

      SUBS R10,R10,#1
      BPL x_loop_sp
   SUBS R11,R11,#1
   BPL y_loop_sp

   SWI      OS_ReadMonotonicTime
   SUBS     R9,R0,R9
   STR      R9,timer_sp
   ;---------End Single Precision

   Cls

   ;---------Calculate Mandelbrot Fractal in Double Precision

   SWI      OS_ReadMonotonicTime
   MOV      R9,R0

   VLDR         D10,ro_dp
   VLDR         D11,ru_dp
   LDR          R0,fractal_size
   VMOV         S31,R0
   VCVT.F32.U32 S31,S31
   VCVT.F64.F32 D12,S31
   VSUB.F64     D10,D10,D11
   VDIV.F64     D31,D10,D12        ;delta = (ro-ru)/fractal_size

   LDR R0,Screen_Start
   LDR R1,Hpixels       ;Screen_Width

   VLDR D7, iter_out_dp ;limit = 4.0
   LDR  R2, iter_max    ;maximum iterations

   LDR R11,fractal_size
   SUB R11,R11,#1
   .y_loop_dp
      LDR R10,fractal_size
      SUB R10,R10,#1

      VLDR         D1, io_dp
      VMOV         S31,R11
      VCVT.F32.U32 S31,S31
      VCVT.F64.F32 D11,S31
      VMUL.F64     D2, D11,D31
      VSUB.F64     D11,D1, D2      ;b = io - current_y * delta

      .x_loop_dp
      VLDR         D1, ru_dp
      VMOV         S31,R10
      VCVT.F32.U32 S31,S31
      VCVT.F64.F32 D10,S31
      VMUL.F64     D2, D10,D31
      VADD.F64     D10,D1, D2      ;a = ru + current_x * delta

      VMOV.F64     D0, D10         ;x = a
      VMOV.F64     D1, D11         ;y = b

      MOV  R12,#0
      .iteration_loop_dp
         ADD R12,R12,#4              ; increment iteration counter
         ; 1.Iteration
         VMUL.F64       D6, D0, D0   ; x*x
         VMUL.F64       D9, D1, D1   ; y*y
         VMUL.F64       D2, D0, D1   ; x*y
         VADD.F64       D14,D6, D9   ; x*x + y*y (save diverge state in S14)
         VSUB.F64       D13,D6, D9   ; x_new = x*x - y*y
         VADD.F64       D12,D2, D2   ; 2*x*y
         VADD.F64       D0, D13,D10  ; x_new = x_new + a
         VADD.F64       D1, D12,D11  ; y_new = y_new + b
         ; 2.Iteration
         VMUL.F64       D6, D0, D0   ; x*x
         VMUL.F64       D9, D1, D1   ; y*y
         VMUL.F64       D2, D0, D1   ; x*y
         VADD.F64       D15,D6, D9   ; x*x + y*y (save diverge state in S15)
         VSUB.F64       D13,D6, D9   ; x_new = x*x - y*y
         VADD.F64       D12,D2, D2   ; 2*x*y
         VADD.F64       D0, D13,D10  ; x_new = x_new + a
         VADD.F64       D1, D12,D11  ; y_new = y_new + b
         ; 3.Iiteration
         VMUL.F64       D6, D0, D0   ; x*x
         VMUL.F64       D9, D1, D1   ; y*y
         VMUL.F64       D2, D0, D1   ; x*y
         VADD.F64       D16,D6, D9   ; x*x + y*y (save diverge state in S16)
         VSUB.F64       D13,D6, D9   ; x_new = x*x - y*y
         VADD.F64       D12,D2, D2   ; 2*x*y
         VADD.F64       D0, D13,D10  ; x_new = x_new + a
         VADD.F64       D1, D12,D11  ; y_new = y_new + b
         ; 4.Iteration
         VMUL.F64       D6, D0, D0   ; x*x
         VMUL.F64       D9, D1, D1   ; y*y
         VMUL.F64       D2, D0, D1   ; x*y
         VADD.F64       D4, D6, D9   ; x*x + y*y
         VSUB.F64       D13,D6, D9   ; x_new = x*x - y*y
         VADD.F64       D12,D2, D2   ; 2*x*y
         VADD.F64       D0, D13,D10  ; x_new = x_new + a
         VADD.F64       D1, D12,D11  ; y_new = y_new + b

         VCMP.F64       D4,D7
         VMRS           APSR_nzcv,FPSCR
         BGT            exit_dp

      CMP R12,R2
      BNE iteration_loop_dp
      B draw_pixel_dp

      .exit_dp
      VCMP.F64       D16,D7     ; check if diverge happend at 3.Iteration
      VMRS           APSR_nzcv,FPSCR
      SUBGT          R12,R12,#1 ; if yes, subtract one iteration
      VCMP.F64       D15,D7     ; check if diverge happend at 2.Iteration
      VMRS           APSR_nzcv,FPSCR
      SUBGT          R12,R12,#1 ; if yes, subtract one iteration
      VCMP.F64       D14,D7     ; check if diverge happend at 1.Iteration
      VMRS           APSR_nzcv,FPSCR
      SUBGT          R12,R12,#1 ; if yes, subtract one iteration

      .draw_pixel_dp

      ;save iteration_counter first
      LDR   R4,iteration_counter_dp
      ADD   R4,R4,R12
      STR   R4,iteration_counter_dp

      MLA   R4,R1,R11,R10
      MOV   R12,R12,LSL#11    ;some simple colour adjustments
      STR   R12,[R0,R4,LSL#2] ;screen address in 32bit mode = (y*Hpixels+y)*4

      SUBS R10,R10,#1
      BPL x_loop_dp
   SUBS R11,R11,#1
   BPL y_loop_dp

   SWI      OS_ReadMonotonicTime
   SUBS     R9,R0,R9
   STR      R9,timer_dp
   ;---------End Double Precision

   B display_results

   ;---------Data_Mandelbrot_Fractal_and_other_stuff_placed_here for alignment
   .ro_sp             DCFS     -0.1450
   .ru_sp             DCFS     -0.1750
   .io_sp             DCFS     -1.02
   .iter_out_sp       DCFS      4.0

   .ro_dp             DCFD     -0.1450
   .ru_dp             DCFD     -0.1750
   .io_dp             DCFD     -1.02
   .iter_out_dp       DCFD      4.0

   .result_multiplier DCFS      0.1 ;/1000 iterations * 100 ms = * 0.1

   .iter_max             DCD   4096
   .fractal_size         DCD    600
   .iteration_counter_sp DCD      0
   .iteration_counter_dp DCD      0
   .timer_sp             DCD      0
   .timer_dp             DCD      0

   ;---------Display Results
   .display_results

   VLDR         S2,result_multiplier
   Home

   Print "Result Single Precision VFP             ":NL
   Print "----------------------------------------":NL
   LDR      R9,timer_sp
   Print1 "TIME taken [ms]              = ",R9:NL
   LDR          R0,iteration_counter_sp
   Print1 "Iterations calculated        = ",R0:NL
   VMOV         S0,R0
   VMOV         S1,R9
   VCVT.F32.U32 S0,S0
   VCVT.F32.U32 S1,S1
   VDIV.F32     S0,S0,S1
   VMUL.F32     S0,S0,S2
   VCVT.U32.F32 S0,S0
   VMOV         R0,S0
   Print1 "Speed in [1000 Iterations/s] = ",R0:NL:NL

   Print "Result Double Precision VFP             ":NL
   Print "----------------------------------------":NL
   LDR      R9,timer_dp
   Print1 "TIME taken [ms]              = ",R9:NL
   LDR          R0,iteration_counter_dp
   Print1 "Iterations calculated        = ",R0:NL
   VMOV         S0,R0
   VMOV         S1,R9
   VCVT.F32.U32 S0,S0
   VCVT.F32.U32 S1,S1
   VDIV.F32     S0,S0,S1
   VMUL.F32     S0,S0,S2
   VCVT.U32.F32 S0,S0
   VMOV         R0,S0
   Print1 "Speed in [1000 Iterations/s] = ",R0

.Check_Escape_Loop
   LDR      R0,Escape_Flag
   CMP      R0,#1
   BNE      Check_Escape_Loop
   BL       Shut_Down
   SWI      OS_Exit

