Compaq KAP Fortran/OpenMP
for Tru64 UNIX
User Guide


Previous Contents Index


Appendix C
OpenMP Examples

For your convenience, the following examples have been adapted from the ANSI X3H5 Parallel Extensions for FORTRAN document.

C.1 DO: A Simple Difference Operator

This example shows a simple parallel loop where the amount of work in each iteration is different. We used dynamic scheduling to get good load balancing. The end do has a nowait because there is an implicit barrier at the end parallel . Alternately, using the option -optimize=1 would have also eliminated the barrier .


        subroutine do_1 (a,b,n) 
        real a(n,n), b(n,n) 
 
!$omp parallel 
!$omp&   shared(a,b,n) 
!$omp&   private(i,j) 
!$omp do schedule(dynamic,1) 
        do i = 2, n 
            do j = 1, i 
               b(j,i) = ( a(j,i) + a(j,i-1) ) / 2 
            enddo 
        enddo 
!$omp end do nowait 
!$omp end parallel 
        end 

C.2 DO: Two Difference Operators

Shows two parallel regions fused to reduce fork/join overhead. The first end do has a nowait because all the data used in the second do is different than all the data used in the first do .


        subroutine do_2 (a,b,c,d,m,n) 
        real a(n,n), b(n,n), c(m,m), d(m,m) 
 
!$omp parallel 
!$omp&   shared(a,b,c,d,m,n) 
!$omp&   private(i,j) 
!$omp do schedule(dynamic,1) 
        do i = 2, n 
            do j = 1, i 
                b(j,i) = ( a(j,i) + a(j,i-1) ) / 2 
            enddo 
        enddo 
!$omp end do nowait 
!$omp do schedule(dynamic,1) 
        do i = 2, m 
            do j = 1, i 
                d(j,i) = ( c(j,i) + c(j,i-1) ) / 2 
            enddo 
        enddo 
!$omp end do nowait 
!$omp end parallel 
        end 

C.3 DO: Reduce Fork/Join Overhead

Routines do_3a and do_3b perform numerically equivalent computations, but because the parallel directive in routine do_3b is outside the do j loop, routine do_3b probably forms teams less often, and thus reduces overhead.


        subroutine do_3a (a,b,m,n) 
        real a(n,m), b(n,m) 
 
        do j = 2, m 
!$omp parallel 
!$omp&   shared(a,b,n,j) 
!$omp&   private(i) 
!$omp do 
            do i = 1, n 
                a(i,j) = b(i,j) / a(i,j-1) 
            enddo 
!$omp end do nowait 
!$omp end parallel 
        enddo 
        end 
 
        subroutine do_3b (a,b,m,n) 
        real a(n,m), b(n,m) 
 
!$omp parallel 
!$omp&   shared(a,b,m,n) 
!$omp&   private(i,j) 
        do j = 2, m 
!$omp do 
            do i = 1, n 
                a(i,j) = b(i,j) / a(i,j-1) 
            enddo 
!$omp end do nowait 
        enddo 
!$omp end parallel 
        end 

C.4 SECTIONS: Two Difference Operators

This example is identical to Section C.2 but uses sections instead of do . Here the speedup is limited to 2 because there are only two units of work, whereas in Section C.2 there are n-1 + m-1 units of work.


        subroutine sections_1 (a,b,c,d,m,n) 
        real a(n,n), b(n,n), c(m,m), d(m,m) 
 
!$omp parallel 
!$omp&   shared(a,b,c,d,m,n) 
!$omp&   private(i,j) 
!$omp sections 
!$omp section 
        do i = 2, n 
            do j = 1, i 
                b(j,i) = ( a(j,i) + a(j,i-1) ) / 2 
            enddo 
        enddo 
!$omp section 
        do i = 2, m 
            do j = 1, i 
                d(j,i) = ( c(j,i) + c(j,i-1) ) / 2 
            enddo 
        enddo 
!$omp end sections nowait 
!$omp end parallel 
        end 

C.5 SINGLE: Updating a Shared Scalar

This example demonstrates how to use a single construct to update an element of the shared array a . The optional end do nowait after the first do is omitted because we need to wait at the end of the do before proceeding into the single .


        subroutine sp_1a (a,b,n) 
        real a(n), b(n) 
 
!$omp parallel 
!$omp&   shared(a,b,n) 
!$omp&   private(i) 
!$omp do 
        do i = 1, n 
            a(i) = 1.0 / a(i) 
        enddo 
!$omp single 
        a(1) = min( a(1), 1.0 ) 
!$omp end single 
!$omp do 
        do i = 1, n 
            b(i) = b(i) / a(i) 
        enddo 
!$omp end do nowait 
!$omp end parallel 
        end 

C.6 SECTIONS: Updating a Shared Scalar

This example is identical to Section C.5 but uses different directives.


        subroutine sections_sp_1 (a,b,n) 
        real a(n), b(n) 
 
!$omp parallel 
!$omp&   shared(a,b,n) 
!$omp&   private(i) 
!$omp do 
        do i = 1, n 
            a(i) = 1.0 / a(i) 
        enddo 
!$omp sections 
        a(1) = min( a(1), 1.0 ) 
!$omp end sections 
!$omp do 
        do i = 1, n 
            b(i) = b(i) / a(i) 
        enddo 
!$omp end do nowait 
!$omp end parallel 
        end 

C.7 DO: Updating a Shared Scalar

This example is identical to Section C.5 but uses different directives.


        subroutine do_sp_1 (a,b,n) 
        real a(n), b(n) 
 
!$omp parallel 
!$omp&   shared(a,b,n) 
!$omp&   private(i) 
!$omp do 
        do i = 1, n 
            a(i) = 1.0 / a(i) 
        enddo 
!$omp end do 
!$omp do 
        do i = 1, 1 
            a(1) = min( a(1), 1.0 ) 
        enddo 
!$omp end do 
!$omp do 
        do i = 1, n 
            b(i) = b(i) / a(i) 
        enddo 
!$omp end do nowait 
!$omp end parallel 
        end 

C.8 PARALLEL DO: A Simple Difference Operator

This example is identical to Section C.1 but uses different directives.


        subroutine paralleldo_1 (a,b,n) 
        real a(n,n), b(n,n) 
 
!$omp parallel do 
!$omp&   shared(a,b,n) 
!$omp&   private(i,j) 
!$omp&   schedule(dynamic,1) 
        do i = 2, n 
            do j = 1, i 
                b(j,i) = ( a(j,i) + a(j,i-1) ) / 2 
            enddo 
        enddo 
        end 

C.9 PARALLEL SECTIONS: Two Difference Operators

This example is identical to Section C.4 but uses different directives. The maximum performance improvement is limited to the number of sections run in parallel, so this example has a maximum parallelism of 2.


        subroutine sections_2 (a,b,c,d,m,n) 
        real a(n,n), b(n,n), c(m,m), d(m,m) 
 
!$omp parallel sections 
!$omp&   shared(a,b,c,d,m,n) 
!$omp&   private(i,j) 
!$omp section 
        do i = 2, n 
            do j = 1, i 
               b(j,i) = ( a(j,i) + a(j,i-1) ) / 2 
            enddo 
        enddo 
!$omp section 
        do i = 2, m 
            do j = 1, i 
               d(j,i) = ( c(j,i) + c(j,i-1) ) / 2 
            enddo 
        enddo 
!$omp end parallel sections 
        end 

C.10 Simple Reduction

This example demonstrates how to perform a reduction using partial sums while avoiding synchronization in the loop body.


        subroutine reduction_1 (a,m,n,sum) 
        real a(m,n) 
 
!$omp parallel 
!$omp&   shared(a,m,n,sum) 
!$omp&   private(i,j,local_sum) 
        local_sum = 0.0 
!$omp do 
        do i = 1, n 
            do j = 1, m 
                local_sum = local_sum + a(j,i) 
            enddo 
        enddo 
!$omp end do nowait 
!$omp critical 
        sum = sum + local_sum 
!$omp end critical 
!$omp end parallel 
        end 

The above reduction could also use the REDUCTION () clause as follows:


        subroutine reduction_2 (a,m,n,sum) 
        real a(m,n) 
 
!$omp parallel do 
!$omp&   shared(a,m,n) 
!$omp&   private(i,j) 
!$omp&   reduction(+:sum) 
        do i = 1, n 
            do j = 1, m 
                local_sum = local_sum + a(j,i) 
            enddo 
        enddo 
        end 

C.11 TASKCOMMON: Private Common

This example demonstrates the use of taskcommon privatizable common blocks.


        subroutine tc_1 (n) 
        common /shared/ a 
        real a(100,100) 
        common /private/ work 
        real work(10000) 
!$omp threadprivate (/private/)  ! this privatizes the 
                                 ! common /private/ 
!$omp parallel 
!$omp&   shared(n) 
!$omp&   private(i) 
!$omp do 
        do i = 1, n 
            call construct_data() ! fills in array work() 
            call use_data()       ! uses array work() 
        enddo 
!$omp end do nowait 
!$omp end parallel 
        end 

C.12 THREADPRIVATE: Private Common and Master Thread

In this example, the value 2 is printed because the master thread's copy of a variable in a threadprivate privatizable common block is accessed within a master section or in serial code sections. If a single was used in place of the master section, some single thread, but not necessarily the master thread, would set j to 2 and the printed result would be indeterminate.


        subroutine tc_2 
        common /blk/ j 
!$omp threadprivate (/blk/) 
 
        j = 1 
!$omp parallel 
!$omp master 
        j = 2 
!$omp end master 
!$omp end parallel 
 
        print *, j 
        end 

C.13 INSTANCE PARALLEL: As a Private Common

This example demonstrates the use of instance parallel privatizable common blocks.


        subroutine ip_1 (n) 
        common /shared/ a 
        real a(100,100) 
        common /private/ work 
        real work(10000) 
!$omp instance parallel (/private/) 
 
!$omp parallel 
!$omp&   shared(n) 
!$omp&   private(i) 
!$omp new (/private/)            ! this privatizes the 
!$omp do                         ! common /private/ 
        do i = 1, n 
            call construct_data()! fills in array work() 
            call use_data()      ! uses array work() 
        enddo 
!$omp end do nowait 
!$omp end parallel 
        end 

C.14 INSTANCE PARALLEL: As a Shared and then a Private Common

This example demonstrates the use of an instance parallel common block first as a shared common block and then as a private common block. This would not be possible with taskcommon blocks because taskcommon blocks are always private.


        subroutine ip_2 (n,m) 
        common /shared/ a,b 
        real a(100,100), b(100,100) 
        common /private/ work 
        real work(10000) 
!$omp instance parallel (/private/) 
 
!$omp parallel                    ! common /private/ is 
!$omp&   shared(a,b,n)            ! shared here since 
!$omp&   private(i)               ! no new appears 
!$omp do 
        do i = 1, n 
            work(i) = b(i,i) / 4.0 
        enddo 
!$omp end do nowait 
!$omp end parallel 
 
        do i = 1, n 
            do j = 1, m 
                a(j,i) = work(i) * ( a(j-1,i) + a(j+1,i) 
     x                   + a(j,i-1) + a(j,i+1) ) 
            enddo 
        enddo 
 
!$omp parallel 
!$omp&   shared(m) 
!$omp&   private(i) 
!$omp new (/private/)             ! this privatizes the 
!$omp do                          ! common /private/ 
        do i = 1, m 
            call construct_data() ! fills in array work() 
            call use_data()       ! uses array work() 
        enddo 
!$omp end do nowait 
!$omp end parallel 
        end 

C.15 Avoiding External Routines: Reduction

This example demonstrates two coding styles for reductions, one using the external routines omp_get_max_threads() and omp_get_thread_num() and the other using only OpenMP directives.


        subroutine reduction_3a (n) 
        real gx( 0:7 )   ! assume 8 processors 
 
        do i = 0, omp_get_max_threads()-1 
            gx(i) = 0 
        enddo 
 
!$omp parallel 
!$omp&   shared(a) 
!$omp&   private(i,lx) 
        lx = 0 
!$omp do 
        do i = 1, n 
            lx = lx + a(i) 
        enddo 
!$omp end do nowait 
        gx( omp_get_thread_num() ) = lx 
!$omp end parallel 
 
        x = 0 
        do i = 0, omp_get_max_threads()-1 
            x = x + gx(i) 
        enddo 
 
        print *, x 
        end 

As is shown below, this example could have been written without the external routines:


        subroutine reduction_3b (n) 
 
        x = 0 
!$omp parallel 
!$omp&   shared(a,x) 
!$omp&   private(i,lx) 
        lx = 0 
!$omp do 
        do i = 1, n 
            lx = lx + a(i) 
        enddo 
!$omp end do nowait 
!$omp critical 
        x = x + lx 
!$omp end critical 
!$omp end parallel 
 
        print *, x 
        end 

This example could have also been written more simply using the reduction() clause as follows:


        subroutine reduction_3c (n) 
 
        x = 0 
!$omp parallel 
!$omp&   shared(a) 
!$omp&   private(i) 
!$omp do reduction(+:x) 
        do i = 1, n 
            x = x + a(i) 
        enddo 
!$omp end do nowait 
!$omp end parallel 
 
        print *, x 
        end 

C.16 Avoiding External Routines: Temporary Storage

This example demonstrates three coding styles for temporary storage, one using the external routine omp_get_thread_num() and the other two using only directives.


     subroutine local_1a (n) 
        dimension a(100) 
        common /cmn/ t( 100, 0:7 )  ! assume 8 processors 
!    max. 
!$omp parallel do 
!$omp&   shared(a,t) 
!$omp&   private(i) 
        do i = 1, n 
            do j = 1, n 
                t(j, omp_get_thread_num()) = a(i) ** 2 
            enddo 
            call work( t(1,omp_get_thread_num()) ) 
        enddo 
        end 

If t is not global, then the above could be accomplished by putting t in the private clause:


        subroutine local_1b (n) 
        dimension t(100) 
 
!$omp parallel do 
!$omp&   shared(a) 
!$omp&   private(i,t) 
        do i = 1, n 
            do j = 1, n 
                t(j) = a(i) ** 2 
            enddo 
            call work( t ) 
        enddo 
        end 

If t is global, then the instance parallel and new directives can be used instead:


        subroutine local_1c (n) 
        dimension t(100) 
        common /cmn/ t 
!$omp instance parallel (/cmn/) 
 
!$omp parallel do 
!$omp&   shared(a) 
!$omp&   private(i) 
!$omp new (/cmn/) 
        do i = 1, n 
            do j = 1, n 
                t(j) = a(i) ** 2 
            enddo 
            call work   ! access t from common /cmn/ 
        enddo 
        end 

C.17 FIRSTPRIVATE: Copying in Initialization Values

Not all of the values of a and b are initialized in the loop before they are used (the rest of the values are produced by init_a and init_b ). Using firstprivate for a and b causes the initialization values produced by init_a and init_b to be copied into private copies of a and b for use in the loops:


        subroutine dsq3_b (c,n) 
        integer n 
        real a(100), b(100), c(n,n), x, y 
        call init_a( a, n ) 
        call init_b( b, n ) 
!$omp parallel do shared(c,n) private(i,j,x,y) firstprivate(a,b) 
        do i = 1, n 
            do j = 1, i 
                a(j) = calc_a(i) 
                b(j) = calc_b(i) 
            enddo 
            do j = 1, n 
                x = a(i) - b(i) 
                y = b(i) + a(i) 
                c(j,i) = x * y 
            enddo 
        enddo 
!$omp end parallel do 
        print *, x, y 
        end 

C.18 THREADPRIVATE: Copying in Initialization Values

This example is similar to Section C.17 except it uses threadprivate common blocks. For threadprivate , copyin is used instead of firstprivate to copy initialization values from the shared (master) copy of /blk/ to the private copies:


        subroutine dsq3_b_tc (c,n) 
        integer n 
        real a(100), b(100), c(n,n), x, y 
        common /blk/ a,b 
!$omp threadprivate (/blk/) 
 
        call init_a( a, n ) 
        call init_b( b, n ) 
!$omp parallel do shared(c,n) private(i,j,x,y) copyin(a,b) 
        do i = 1, n 
            do j = 1, i 
                a(j) = calc_a(i) 
                b(j) = calc_b(i) 
            enddo 
            do j = 1, n 
                x = a(i) - b(i) 
                y = b(i) + a(i) 
                c(j,i) = x * y 
            enddo 
        enddo 
!$omp end parallel do 
        print *, x, y 
        end 

C.19 INSTANCE PARALLEL: Copying in Initialization Values

This example is similar to Section C.17 except is uses instance parallel privatizable common blocks. For instance parallel , copy new is used instead of firstprivate to privatize the common block and to copy initialization values from the shared (master) copy of /blk/ to the private copies:


        subroutine dsq3_b_ip (c,n) 
        integer n 
        real a(100), b(100), c(n,n), x, y 
        common /blk/ a,b 
!$omp instance parallel (/blk/) 
 
        call init_a( a, n ) 
        call init_b( b, n ) 
!$omp parallel do shared(c,n) private(i,j,x,y) 
!$omp copy new (/blk/) 
        do i = 1, n 
            do j = 1, i 
                a(j) = calc_a(i) 
                b(j) = calc_b(i) 
            enddo 
            do j = 1, n 
                x = a(i) - b(i) 
                y = b(i) + a(i) 
                c(j,i) = x * y 
            enddo 
        enddo 
!$omp end parallel do 
        print *, x, y 
        end 


Previous Next Contents Index