ROSE  0.9.6a
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
libxomp.h
Go to the documentation of this file.
1 /*
2  * A common layer for both gomp and omni runtime library
3  * Liao 1/20/2009
4  * */
5 #ifndef LIB_XOMP_H
6 #define LIB_XOMP_H
7 
8 // Fortran outlined function uses one parameter for each variable to be passed by reference
9 // We predefine a max number of parameters to be allowed here.
10 #define MAX_OUTLINED_FUNC_PARAMETER_COUNT 256
11 #ifdef __cplusplus
12 extern "C" {
13 #endif
14 
15 #include <stdio.h>
16 #include <stdbool.h>
17 #include <stdlib.h> // for abort()
18 #include <assert.h>
19 #include <sys/time.h>
20 
21 // return the current time stamp in a double floating point number
22 extern double xomp_time_stamp(void);
23 extern int env_region_instr_val; // save the environment variable value for instrumentation support
24 //e.g. export XOMP_REGION_INSTR=0|1
25 
26 //enum omp_rtl_enum {
27 // e_gomp,
28 // e_omni,
29 // e_last_rtl
30 //};
31 //
32 //extern omp_rtl_enum rtl_type;
33 
34 //Runtime library initialization routine
35 extern void XOMP_init (int argc, char ** argv);
36 extern void xomp_init (void);
37 
38 // Runtime library termination routine
39 extern void XOMP_terminate (int exitcode);
40 
41 // func: pointer to a function which will be run in parallel
42 // data: pointer to a data segment which will be used as the arguments of func
43 // ifClauseValue: set to if-clause-expression if if-clause exists, or default is 1.
44 // numThreadsSpecified: set to the expression of num_threads clause if the clause exists, or default is 0
45 // file_name:line_no the start source file info about this parallel region, used to pass source level info. to runtime
46 extern void XOMP_parallel_start (void (*func) (void *), void *data, unsigned ifClauseValue, unsigned numThreadsSpecified, char* file_name, int line_no);
47 extern void XOMP_parallel_end (char* file_name, int line_no);
48 
49 /* Initialize sections and return the next section id (starting from 0) to be executed by the current thread */
50 extern int XOMP_sections_init_next(int section_count);
51 
52 /* Return the next section id (starting from 0) to be executed by the current thread. Return value <0 means no sections left */
53 extern int XOMP_sections_next(void);
54 
55 /* Called after the current thread is told that all sections are executed. It synchronizes all threads also. */
56 extern void XOMP_sections_end(void);
57 
58 /* Called after the current thread is told that all sections are executed. It does not synchronize all threads. */
59 extern void XOMP_sections_end_nowait(void);
60 
61 extern void XOMP_task (void (*) (void *), void *, void (*) (void *, void *),
62  long, long, bool, unsigned);
63 extern void XOMP_taskwait (void);
64 
65 // scheduler functions, union of runtime library functions
66 // empty body if not used by one
67 // scheduler initialization, only meaningful used for OMNI
68 
69 // Default loop scheduling, worksharing without any schedule clause, upper bounds are inclusive
70 // Kick in before all runtime libraries. We use the default loop scheduling from XOMP regardless of the runtime chosen.
71 extern void XOMP_loop_default(int lower, int upper, int stride, long* n_lower, long* n_upper);
72 
74 // Non-op for gomp
75 extern void XOMP_loop_static_init(int lower, int upper, int stride, int chunk_size);
76 extern void XOMP_loop_dynamic_init(int lower, int upper, int stride, int chunk_size);
77 extern void XOMP_loop_guided_init(int lower, int upper, int stride, int chunk_size);
78 extern void XOMP_loop_runtime_init(int lower, int upper, int stride);
79 
80 // ordered case
81 extern void XOMP_loop_ordered_static_init(int lower, int upper, int stride, int chunk_size);
82 extern void XOMP_loop_ordered_dynamic_init(int lower, int upper, int stride, int chunk_size);
83 extern void XOMP_loop_ordered_guided_init(int lower, int upper, int stride, int chunk_size);
84 extern void XOMP_loop_ordered_runtime_init(int lower, int upper, int stride);
85 
86 
87 // if (start),
88 // mostly used because of gomp, omni will just call XOMP_loop_xxx_next();
89 // (long start, long end, long incr, long chunk_size,long *istart, long *iend)
90 // upper bounds are non-inclusive,
91 // bounds for inclusive loop control will need +/-1 , depending on incremental/decremental cases
92 extern bool XOMP_loop_static_start (long, long, long, long, long *, long *);
93 extern bool XOMP_loop_dynamic_start (long, long, long, long, long *, long *);
94 extern bool XOMP_loop_guided_start (long, long, long, long, long *, long *);
95 extern bool XOMP_loop_runtime_start (long, long, long, long *, long *);
96 
97 extern bool XOMP_loop_ordered_static_start (long, long, long, long, long *, long *);
98 extern bool XOMP_loop_ordered_dynamic_start (long, long, long, long, long *, long *);
99 extern bool XOMP_loop_ordered_guided_start (long, long, long, long, long *, long *);
100 extern bool XOMP_loop_ordered_runtime_start (long, long, long, long *, long *);
101 
102 // next
103 extern bool XOMP_loop_static_next (long *, long *);
104 extern bool XOMP_loop_dynamic_next (long *, long *);
105 extern bool XOMP_loop_guided_next (long *, long *);
106 extern bool XOMP_loop_runtime_next (long *, long *);
107 
108 extern bool XOMP_loop_ordered_static_next (long *, long *);
109 extern bool XOMP_loop_ordered_dynamic_next (long *, long *);
110 extern bool XOMP_loop_ordered_guided_next (long *, long *);
111 extern bool XOMP_loop_ordered_runtime_next (long *, long *);
112 
113 //--------------end of loop functions
114 
115 extern void XOMP_barrier (void);
116 extern void XOMP_critical_start (void** data);
117 extern void XOMP_critical_end (void** data);
118 extern bool XOMP_single(void);
119 extern bool XOMP_master(void);
120 
121 extern void XOMP_atomic_start (void);
122 extern void XOMP_atomic_end (void);
123 
124 extern void XOMP_loop_end (void);
125 extern void XOMP_loop_end_nowait (void);
126  // --- end loop functions ---
127 // flush without variable list
128 extern void XOMP_flush_all (void);
129 // omp flush with variable list, flush one by one, given each's start address and size
130 extern void XOMP_flush_one (char * startAddress, int nbyte);
131 
132 
133 // omp ordered directive
134 extern void XOMP_ordered_start (void);
135 extern void XOMP_ordered_end (void);
136 
137 //--------------------- extensions to support OpenMP accelerator model experimental implementation------
138 // We only include
139 //--------------------- kernel launch ------------------
140 
// Return the max number of threads per thread block of the first available device.
// NOTE: declared with (void) — an empty () in C means "unspecified parameters",
// not "no parameters"; (void) matches the prototype style used elsewhere in this header.
extern size_t xomp_get_maxThreadsPerBlock(void);
143 
144 //get the max number of 1D blocks for a given input length
145 extern size_t xomp_get_max1DBlock(size_t ss);
146 
147 // Get the max number threads for one dimension (x or y) of a 2D block
148 // Two factors are considered: the total number of threads within the 2D block must be <= total threads per block
149 // x * y <= maxThreadsPerBlock 512 or 1024
150 // each dimension: the number of threads must <= maximum x/y-dimension
151 // x <= maxThreadsDim[0], 1024
152 // y <= maxThreadsDim[1], 1024
153 // maxThreadsDim[0] happens to be equal to maxThreadsDim[1] so we use a single function to calculate max segments for both dimensions
154 extern size_t xomp_get_max_threads_per_dimesion_2D ();
155 
156 // return the max number of segments for a dimension (either x or y) of a 2D block
157 extern size_t xomp_get_maxSegmentsPerDimensionOf2DBlock(size_t dimension_size);
158 
159 //------------------memory allocation/copy/free----------------------------------
160 //Allocate device memory and return the pointer
161 // This should be a better interface than cudaMalloc()
162 // since it mimics malloc() closely
163 /*
164 return a pointer to the allocated space
165  * upon successful completion with size not equal to 0
166 return a null pointer if
167  * size is 0
168  * failure due to any reason
169 */
170 extern void* xomp_deviceMalloc(size_t size);
171 
172 // A host version
173 extern void* xomp_hostMalloc(size_t size);
174 
// Get the time stamp for now, up to microsecond resolution: 1e-6 (but maybe 1e-4 in practice).
// NOTE(review): duplicate of the xomp_time_stamp declaration earlier in this
// header; kept for source compatibility, but written as a proper (void)
// prototype so both declarations are consistent.
extern double xomp_time_stamp(void);
177 
178 
179 // memory copy from src to dest, return the pointer to dest. NULL pointer if anything is wrong
180 extern void * xomp_memcpyHostToDevice (void *dest, const void * src, size_t n_n);
181 extern void * xomp_memcpyDeviceToHost (void *dest, const void * src, size_t n_n);
182 // copy a dynamically allocated host source array to linear dest address on a GPU device. the dimension information of the source array
183 // is given by: int dimensions[dimension_size], with known element size.
184 // bytes_copied reports the total bytes copied by this function.
185 // Note: It cannot be used to copy static arrays declared like type array[N][M] !!
186 extern void * xomp_memcpyDynamicHostToDevice (void *dest, const void * src, int * dimensions, size_t dimension_size, size_t element_size, size_t *bytes_copied);
187 
188 // copy linear src memory to dynamically allocated destination, with dimension information given by
189 // int dimensions[dimension_size]
190 // the source memory has total n continuous memory, with known size for each element
191 // the total bytes copied by this function is reported by bytes_copied
192 extern void * xomp_memcpyDynamicDeviceToHost (void *dest, int * dimensions, size_t dimension_size, const void * src, size_t element_size, size_t *bytes_copied);
193 
194 extern void * xomp_memcpyDeviceToDevice (void *dest, const void * src, size_t n_n);
195 extern void * xomp_memcpyHostToHost (void *dest, const void * src, size_t n_n); // same as memcpy??
196 
197 
198 // free the device memory pointed by a pointer, return false in case of failure, otherwise return true
199 extern bool xomp_freeDevice(void* devPtr);
200 // free the host memory pointed by a pointer, return false in case of failure, otherwise return true
201 extern bool xomp_freeHost(void* hostPtr);
202 
203 /* Allocation/Free functions for Host */
204 /* Allocate a multi-dimensional array
205  *
206  * Input parameters:
207  * int *dimensions: an integer array storing the size of each dimension
208  * size_t dimension_num: the number of dimensions
209  * size_t esize: the size of an array element
210  *
211  * return:
212  * the pointer to the allocated array
213  * */
214 extern void * xomp_mallocArray(int * dimensions, size_t dimension_num, size_t esize);
215 
216 extern void xomp_freeArrayPointer (void* array, int * dimensions, size_t dimension_num);
217 
218 
219 /* CUDA reduction support */
220 //------------ types for CUDA reduction support---------
221 // Reduction for regular OpenMP is supported by compiler translation. No runtime support is needed.
222 // For the accelerator model experimental implementation, we use a two-level reduction method:
223 // thread-block level within GPU + beyond-block level on CPU
224 
225 /* an internal union type to be flexible for all types associated with reduction operations
226  We don't really want to expose this to the compiler to simplify the compiler translation.
227 */
228 // We try to limit the numbers of runtime data types exposed to a compiler.
229 // A set of integers to represent reduction operations
230 #define XOMP_REDUCTION_PLUS 6
231 #define XOMP_REDUCTION_MINUS 7
232 #define XOMP_REDUCTION_MUL 8
233 #define XOMP_REDUCTION_BITAND 9 // &
234 #define XOMP_REDUCTION_BITOR 10 // |
235 #define XOMP_REDUCTION_BITXOR 11 // ^
236 #define XOMP_REDUCTION_LOGAND 12 // &&
237 #define XOMP_REDUCTION_LOGOR 13 // ||
238 
239 #if 0
240 // No linker support for device code. We have to put implementation of these device functions into the header
241 // TODO: wait until nvcc supports linker for device code.
242 //#define XOMP_INNER_BLOCK_REDUCTION_DECL(dtype) \
243 //__device__ void xomp_inner_block_reduction_##dtype(dtype local_value, dtype * grid_level_results, int reduction_op);
244 //
246 //XOMP_INNER_BLOCK_REDUCTION_DECL(int)
247 //XOMP_INNER_BLOCK_REDUCTION_DECL(float)
248 //XOMP_INNER_BLOCK_REDUCTION_DECL(double)
249 //
250 //#undef XOMP_INNER_BLOCK_REDUCTION_DECL
251 
252 #endif
253 
254 #define XOMP_BEYOND_BLOCK_REDUCTION_DECL(dtype) \
255  dtype xomp_beyond_block_reduction_##dtype(dtype * per_block_results, int numBlocks, int reduction_op);
256 
260 
261 #undef XOMP_BEYOND_BLOCK_REDUCTION_DECL
262 // Liao, 8/29/2013
263 // Support round-robin static scheduling of loop iterations running on GPUs (accelerator)
264 // Static even scheduling may cause each thread to touch too much data, which stress memory channel.
265 // NOT IN USE. We use compiler to generate the variables instead of using a runtime data structure.
267  int num; /* the thread number of this thread in team */
268  int num_thds; /* current running thread, referenced by children */
269  int in_parallel; /* current thread executes the region in parallel */
270 
271  /* used for schedule */
272  int loop_chunk_size; //************* this is the chunk size
273  int loop_end; //************* equivalent to upper limit, up
274  int loop_sched_index; //************* lb+chunk_size*tp->num (num is the thread number of this thread in team)
275  int loop_stride; //************* chunk_size * nthds /* used for static scheduling */
276 
277  /* for 'lastprivate' */
278  int is_last;
279 };
280 
281 #define XOMP_MAX_MAPPED_VARS 256 // for simplicity, we use preallocated memory for storing the mapped variable list
282 /* Test runtime support for nested device data environments */
283 /* Liao, May 2, 2013*/
284 /* A data structure to keep track of a mapped variable
285  * Right now we use memory address of the original variable and the size of the variable
286  * */
288 {
289  void * address; // original variable's address
290  int size;
291  void * dev_address; // the corresponding device variable's address
292  bool copyBack; // if this variable should be copied back to HOST
293 };
294 
296 extern void copy_mapped_variable (struct XOMP_mapped_variable* desc, struct XOMP_mapped_variable* src );
297 
298 /* A doubly linked list for tracking Device Data Environment (DDE) */
/* Node in the doubly linked list that tracks Device Data Environments (DDEs).
 * parent/child links give stack-like nesting of device data regions; the list
 * head/tail pointers are declared below (DDE_head/DDE_tail). */
299 struct DDE_data {
300  // Do we need this at all? we can allocate/deallocate data without saving region ID
301  int Region_ID; // hash of the AST node? or just memory address of the AST node for now
302 
303 // array of the newly mapped variables
 // NOTE(review): the fixed-size per-node arrays below are commented out; the
 // mapped-variable storage appears to live elsewhere — confirm before relying on it.
306  //struct XOMP_mapped_variable new_variables[XOMP_MAX_MAPPED_VARS];
307 
308 // array of inherited mapped variable from possible upper level DDEs
311  //struct XOMP_mapped_variable inherited_variables[XOMP_MAX_MAPPED_VARS];
312 
313  // link to its parent node
314  struct DDE_data* parent;
315  // link to its child node
316  struct DDE_data* child;
317 };
318 
319 // The head of the list of DDE data nodes
320 extern struct DDE_data* DDE_head; //TODO. We don't really need this head pointer, it is like a stack, access the end is enough
321 // The tail of the list
322 extern struct DDE_data* DDE_tail;
323 
// Create a new DDE-data node and append it to the end of the tracking list;
// copy all variables from its parent node into the set of inherited variables.
// (Earlier name: XOMP_Device_Data_Environment_Enter.)
// Declared (void): empty () in C leaves the parameter list unspecified.
extern void xomp_deviceDataEnvironmentEnter(void);
328 
329 // Check if an original variable is already mapped in enclosing data environment, return its device variable's address if yes.
330 // return NULL if not
331 //void* XOMP_Device_Data_Environment_Get_Inherited_Variable (void* original_variable_address, int size);
332 extern void* xomp_deviceDataEnvironmentGetInheritedVariable (void* original_variable_address, int size);
333 
335 //void XOMP_Device_Data_Environment_Add_Variable (void* var_addr, int var_size, void * dev_addr);
336 extern void xomp_deviceDataEnvironmentAddVariable (void* var_addr, int var_size, void * dev_addr, bool copy_back);
337 
// Exit the current DDE: deallocate device memory and delete the DDE-data node
// from the end of the tracking list.
// (Earlier name: XOMP_Device_Data_Environment_Exit.)
// Declared (void): empty () in C leaves the parameter list unspecified.
extern void xomp_deviceDataEnvironmentExit(void);
341 
342 
343 #ifdef __cplusplus
344  }
345 #endif
346 
347 #endif /* LIB_XOMP_H */
348 
349 
350 
351