ROSE
0.9.6a
Main Page
Related Pages
Modules
Namespaces
Classes
Files
Examples
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
libxomp.h
Go to the documentation of this file.
1
/*
2
* A common layer for both gomp and omni runtime library
3
* Liao 1/20/2009
4
* */
5
#ifndef LIB_XOMP_H
6
#define LIB_XOMP_H
7
8
// Fortran outlined function uses one parameter for each variable to be passed by reference
9
// We predefine a max number of parameters to be allowed here.
10
#define MAX_OUTLINED_FUNC_PARAMETER_COUNT 256
11
#ifdef __cplusplus
12
extern
"C"
{
13
#endif
14
15
#include <stdio.h>
16
#include <stdbool.h>
17
#include <stdlib.h>
// for abort()
18
#include <assert.h>
19
#include <sys/time.h>
20
21
// Return the current time stamp as a double-precision floating point number.
extern double xomp_time_stamp(void);

// Saved value of the environment variable used for instrumentation support,
// e.g. export XOMP_REGION_INSTR=0|1
extern int env_region_instr_val;
25
26
//enum omp_rtl_enum {
27
// e_gomp,
28
// e_omni,
29
// e_last_rtl
30
//};
31
//
32
//extern omp_rtl_enum rtl_type;
33
34
// Runtime library initialization routine.
// NOTE(review): argc/argv are presumably the program's command-line
// arguments -- confirm against the implementation.
extern void XOMP_init(int argc, char **argv);

// NOTE(review): argument-free, lower-case variant of the initialization
// routine; its intended caller is not visible in this header -- confirm.
extern void xomp_init(void);

// Runtime library termination routine.
extern void XOMP_terminate(int exitcode);
40
41
// Parallel region entry point.
//   func:                pointer to a function which will be run in parallel
//   data:                pointer to a data segment which will be used as the arguments of func
//   ifClauseValue:       set to the if-clause expression if an if clause exists; the default is 1
//   numThreadsSpecified: set to the expression of the num_threads clause if the clause exists;
//                        the default is 0
//   file_name, line_no:  the start source file info about this parallel region, used to pass
//                        source-level info to the runtime
extern void XOMP_parallel_start(void (*func)(void *), void *data,
                                unsigned ifClauseValue, unsigned numThreadsSpecified,
                                char *file_name, int line_no);

// Counterpart of XOMP_parallel_start.
// NOTE(review): file_name/line_no presumably carry source location info for
// the runtime, as they do for XOMP_parallel_start -- confirm.
extern void XOMP_parallel_end(char *file_name, int line_no);
48
49
/* Initialize sections and return the next section id (starting from 0) to be
 * executed by the current thread. */
extern int XOMP_sections_init_next(int section_count);

/* Return the next section id (starting from 0) to be executed by the current
 * thread. A return value < 0 means no sections are left. */
extern int XOMP_sections_next(void);

/* Called after the current thread is told that all sections are executed.
 * It also synchronizes all threads. */
extern void XOMP_sections_end(void);

/* Called after the current thread is told that all sections are executed.
 * It does not synchronize the threads. */
extern void XOMP_sections_end_nowait(void);
60
61
// Task creation entry point. The parameters are unnamed in this header.
// NOTE(review): the parameter list looks like libgomp's GOMP_task
// (fn, data, cpyfn, arg_size, arg_align, if_clause, flags) -- confirm against
// the implementation before relying on that reading.
extern void XOMP_task(void (*)(void *), void *, void (*)(void *, void *),
                      long, long, bool, unsigned);

// NOTE(review): presumably implements the OpenMP taskwait directive -- confirm.
extern void XOMP_taskwait(void);
64
65
// scheduler functions, union of runtime library functions
66
// empty body if not used by one
67
// scheduler initialization, only meaningful for OMNI
68
69
// Default loop scheduling: worksharing without any schedule clause.
// Upper bounds are inclusive.
// Kicks in before all runtime libraries: the default loop scheduling from XOMP
// is used regardless of the runtime chosen.
// NOTE(review): n_lower/n_upper are presumably output bounds for the calling
// thread -- confirm against the implementation.
extern void XOMP_loop_default(int lower, int upper, int stride,
                              long *n_lower, long *n_upper);
72
74
// Loop scheduler initialization for the static, dynamic, guided and runtime
// variants. Non-op for gomp.
extern void XOMP_loop_static_init(int lower, int upper, int stride, int chunk_size);
extern void XOMP_loop_dynamic_init(int lower, int upper, int stride, int chunk_size);
extern void XOMP_loop_guided_init(int lower, int upper, int stride, int chunk_size);
// The runtime variant takes no chunk_size argument.
extern void XOMP_loop_runtime_init(int lower, int upper, int stride);
79
80
// Ordered case: same signatures as the non-ordered initialization functions.
extern void XOMP_loop_ordered_static_init(int lower, int upper, int stride, int chunk_size);
extern void XOMP_loop_ordered_dynamic_init(int lower, int upper, int stride, int chunk_size);
extern void XOMP_loop_ordered_guided_init(int lower, int upper, int stride, int chunk_size);
// The runtime variant takes no chunk_size argument.
extern void XOMP_loop_ordered_runtime_init(int lower, int upper, int stride);
85
86
87
// if (start),
// Mostly used because of gomp; omni will just call XOMP_loop_xxx_next().
// Parameters: (long start, long end, long incr, long chunk_size, long *istart, long *iend)
// Upper bounds are non-inclusive; bounds for inclusive loop control will need
// +/-1, depending on incremental/decremental cases.
extern bool XOMP_loop_static_start(long, long, long, long, long *, long *);
extern bool XOMP_loop_dynamic_start(long, long, long, long, long *, long *);
extern bool XOMP_loop_guided_start(long, long, long, long, long *, long *);
// The runtime variant has one fewer long parameter.
// NOTE(review): presumably chunk_size is the one omitted -- confirm.
extern bool XOMP_loop_runtime_start(long, long, long, long *, long *);
96
97
// Ordered counterparts of the XOMP_loop_*_start functions, with the same
// parameter lists.
extern bool XOMP_loop_ordered_static_start(long, long, long, long, long *, long *);
extern bool XOMP_loop_ordered_dynamic_start(long, long, long, long, long *, long *);
extern bool XOMP_loop_ordered_guided_start(long, long, long, long, long *, long *);
// The runtime variant has one fewer long parameter.
extern bool XOMP_loop_ordered_runtime_start(long, long, long, long *, long *);
101
102
// next
// NOTE(review): the two long* parameters are presumably the istart/iend
// outputs named in the comment on the *_start functions -- confirm.
extern bool XOMP_loop_static_next(long *, long *);
extern bool XOMP_loop_dynamic_next(long *, long *);
extern bool XOMP_loop_guided_next(long *, long *);
extern bool XOMP_loop_runtime_next(long *, long *);
107
108
// Ordered counterparts of the XOMP_loop_*_next functions, with the same
// parameter lists.
extern bool XOMP_loop_ordered_static_next(long *, long *);
extern bool XOMP_loop_ordered_dynamic_next(long *, long *);
extern bool XOMP_loop_ordered_guided_next(long *, long *);
extern bool XOMP_loop_ordered_runtime_next(long *, long *);
112
113
//--------------end of loop functions
114
115
// Synchronization and worksharing entry points.
// NOTE(review): the names suggest these back the OpenMP barrier, critical,
// single, master and atomic constructs and the end of a worksharing loop;
// this header does not document them -- confirm against the implementation.
extern void XOMP_barrier(void);

// NOTE(review): data is presumably a per-critical-section handle shared by
// the matching start/end calls -- confirm.
extern void XOMP_critical_start(void **data);
extern void XOMP_critical_end(void **data);

// NOTE(review): the bool result presumably tells the calling thread whether
// it should execute the single/master block -- confirm.
extern bool XOMP_single(void);
extern bool XOMP_master(void);

extern void XOMP_atomic_start(void);
extern void XOMP_atomic_end(void);

extern void XOMP_loop_end(void);
extern void XOMP_loop_end_nowait(void);
126
// --- end loop functions ---
127
// flush without a variable list
extern void XOMP_flush_all(void);

// omp flush with a variable list: flush the variables one by one, given each
// one's start address and size
extern void XOMP_flush_one(char *startAddress, int nbyte);

// omp ordered directive
extern void XOMP_ordered_start(void);
extern void XOMP_ordered_end(void);
136
137
//--------------------- extensions to support OpenMP accelerator model experimental implementation------
138
// We only include
139
//--------------------- kernel launch ------------------
140
141
// Parameterless functions below are declared with (void): in C an empty
// list () leaves the parameters unspecified instead of declaring "none".

// the max number of threads per thread block of the first available device
extern size_t xomp_get_maxThreadsPerBlock(void);

// get the max number of 1D blocks for a given input length
extern size_t xomp_get_max1DBlock(size_t ss);

// Get the max number of threads for one dimension (x or y) of a 2D block.
// Two factors are considered:
//   1. the total number of threads within the 2D block must be <= the total
//      threads per block:
//        x * y <= maxThreadsPerBlock (512 or 1024)
//   2. for each dimension, the number of threads must be <= the maximum
//      x/y-dimension:
//        x <= maxThreadsDim[0], 1024
//        y <= maxThreadsDim[1], 1024
// maxThreadsDim[0] happens to be equal to maxThreadsDim[1], so a single
// function is used to calculate the max segments for both dimensions.
extern size_t xomp_get_max_threads_per_dimesion_2D(void);

// return the max number of segments for a dimension (either x or y) of a 2D block
extern size_t xomp_get_maxSegmentsPerDimensionOf2DBlock(size_t dimension_size);
158
159
//------------------memory allocation/copy/free----------------------------------
160
// Allocate device memory and return the pointer.
// This should be a better interface than cudaMalloc() since it mimics
// malloc() closely:
//   - returns a pointer to the allocated space upon successful completion
//     with size not equal to 0
//   - returns a null pointer if size is 0, or on failure due to any reason
extern void *xomp_deviceMalloc(size_t size);

// A host version
extern void *xomp_hostMalloc(size_t size);

// Get the time stamp for now, up to microsecond resolution: 1e-6, but maybe
// 1e-4 in practice.
// This function is also declared near the top of this header with a (void)
// parameter list; the same spelling is used here so both declarations agree.
extern double xomp_time_stamp(void);
177
178
179
// Memory copy from src to dest; returns the pointer to dest, or a NULL
// pointer if anything is wrong.
// NOTE(review): n_n is presumably the number of bytes to copy -- confirm.
extern void *xomp_memcpyHostToDevice(void *dest, const void *src, size_t n_n);
extern void *xomp_memcpyDeviceToHost(void *dest, const void *src, size_t n_n);

// Copy a dynamically allocated host source array to a linear dest address on
// a GPU device. The dimension information of the source array is given by
// int dimensions[dimension_size], with known element size.
// bytes_copied reports the total bytes copied by this function.
// Note: it cannot be used to copy static arrays declared like type array[N][M]!
extern void *xomp_memcpyDynamicHostToDevice(void *dest, const void *src,
                                            int *dimensions, size_t dimension_size,
                                            size_t element_size, size_t *bytes_copied);

// Copy linear src memory to a dynamically allocated destination, with the
// dimension information given by int dimensions[dimension_size].
// The source memory is contiguous, with a known size for each element.
// The total bytes copied by this function is reported by bytes_copied.
extern void *xomp_memcpyDynamicDeviceToHost(void *dest, int *dimensions,
                                            size_t dimension_size, const void *src,
                                            size_t element_size, size_t *bytes_copied);

extern void *xomp_memcpyDeviceToDevice(void *dest, const void *src, size_t n_n);
extern void *xomp_memcpyHostToHost(void *dest, const void *src, size_t n_n); // same as memcpy??
196
197
198
// Free the device memory pointed to by devPtr.
// Returns false in case of failure, otherwise true.
extern bool xomp_freeDevice(void *devPtr);

// Free the host memory pointed to by hostPtr.
// Returns false in case of failure, otherwise true.
extern bool xomp_freeHost(void *hostPtr);
202
203
/* Allocation/Free functions for Host */

/* Allocate a multi-dimensional array.
 *
 * Input parameters:
 *   int *dimensions:      an integer array storing the size of each dimension
 *   size_t dimension_num: the number of dimensions
 *   size_t esize:         the size of an array element
 *
 * Returns:
 *   the pointer to the allocated array
 */
extern void *xomp_mallocArray(int *dimensions, size_t dimension_num, size_t esize);

/* NOTE(review): presumably releases an array obtained from xomp_mallocArray,
 * given the same dimension description -- confirm against the implementation. */
extern void xomp_freeArrayPointer(void *array, int *dimensions, size_t dimension_num);
217
218
219
/* CUDA reduction support */
220
//------------ types for CUDA reduction support---------
221
// Reduction for regular OpenMP is supported by compiler translation. No runtime support is needed.
222
// For the accelerator model experimental implementation, we use a two-level reduction method:
223
// thread-block level within GPU + beyond-block level on CPU
224
225
/* an internal union type to be flexible for all types associated with reduction operations
226
We don't really want to expose this to the compiler to simplify the compiler translation.
227
*/
228
// We try to limit the numbers of runtime data types exposed to a compiler.
229
// A set of integers to represent reduction operations.
// The trailing comment on each line shows the operator where the original gave one.
#define XOMP_REDUCTION_PLUS 6
#define XOMP_REDUCTION_MINUS 7
#define XOMP_REDUCTION_MUL 8
#define XOMP_REDUCTION_BITAND 9 // &
#define XOMP_REDUCTION_BITOR 10 // |
#define XOMP_REDUCTION_BITXOR 11 // ^
#define XOMP_REDUCTION_LOGAND 12 // &&
#define XOMP_REDUCTION_LOGOR 13 // ||
238
239
#if 0
240
// No linker support for device code. We have to put implementation of these device functions into the header
241
// TODO: wait until nvcc supports linker for device code.
242
//#define XOMP_INNER_BLOCK_REDUCTION_DECL(dtype) \
243
//__device__ void xomp_inner_block_reduction_##dtype(dtype local_value, dtype * grid_level_results, int reduction_op);
244
//
246
//XOMP_INNER_BLOCK_REDUCTION_DECL(int)
247
//XOMP_INNER_BLOCK_REDUCTION_DECL(float)
248
//XOMP_INNER_BLOCK_REDUCTION_DECL(double)
249
//
250
//#undef XOMP_INNER_BLOCK_REDUCTION_DECL
251
252
#endif
253
254
// Declares the beyond-block reduction routine for one element type:
//   dtype xomp_beyond_block_reduction_<dtype>(dtype *per_block_results,
//                                             int numBlocks, int reduction_op);
// The macro leaves out the trailing semicolon so that each use below reads as
// an ordinary declaration and no stray empty declaration is produced.
// NOTE(review): reduction_op is presumably one of the XOMP_REDUCTION_*
// constants -- confirm against the implementation.
#define XOMP_BEYOND_BLOCK_REDUCTION_DECL(dtype) \
  extern dtype xomp_beyond_block_reduction_##dtype(dtype *per_block_results, int numBlocks, int reduction_op)

XOMP_BEYOND_BLOCK_REDUCTION_DECL(int);
XOMP_BEYOND_BLOCK_REDUCTION_DECL(float);
XOMP_BEYOND_BLOCK_REDUCTION_DECL(double);

#undef XOMP_BEYOND_BLOCK_REDUCTION_DECL
262
// Liao, 8/29/2013
// Support round-robin static scheduling of loop iterations running on GPUs
// (accelerator). Static even scheduling may cause each thread to touch too
// much data, which stresses the memory channel.
// NOT IN USE. The compiler generates the variables instead of using a runtime
// data structure.
struct XOMP_accelerator_thread {
  int num;         /* the thread number of this thread in team */
  int num_thds;    /* current running thread, referenced by children */
  int in_parallel; /* current thread executes the region in parallel */

  /* used for schedule */
  int loop_chunk_size;  // this is the chunk size
  int loop_end;         // equivalent to upper limit, up
  int loop_sched_index; // lb + chunk_size * tp->num (num is the thread number of this thread in team)
  int loop_stride;      // chunk_size * nthds; used for static scheduling

  /* for 'lastprivate' */
  int is_last;
};
280
281
// For simplicity, preallocated memory is used for storing the mapped variable list.
#define XOMP_MAX_MAPPED_VARS 256

/* Test runtime support for nested device data environments */
/* Liao, May 2, 2013 */
/* A data structure to keep track of a mapped variable.
 * Right now the memory address of the original variable and the size of the
 * variable are used.
 */
struct XOMP_mapped_variable {
  void *address;     // original variable's address
  int size;          // size of the variable; NOTE(review): presumably in bytes -- confirm
  void *dev_address; // the corresponding device variable's address
  bool copyBack;     // if this variable should be copied back to HOST
};

// NOTE(review): presumably copies the descriptor *src into *desc -- confirm
// against the implementation.
extern void copy_mapped_variable(struct XOMP_mapped_variable *desc,
                                 struct XOMP_mapped_variable *src);
297
298
/* A doubly linked list for tracking Device Data Environment (DDE) */
struct DDE_data {
  // Do we need this at all? Data can be allocated/deallocated without saving the region ID.
  int Region_ID; // hash of the AST node? or just memory address of the AST node for now

  // array of the newly mapped variables
  int new_variable_count;
  struct XOMP_mapped_variable *new_variables;
  //struct XOMP_mapped_variable new_variables[XOMP_MAX_MAPPED_VARS];

  // array of inherited mapped variables from possible upper level DDEs
  int inherited_variable_count;
  struct XOMP_mapped_variable *inherited_variables;
  //struct XOMP_mapped_variable inherited_variables[XOMP_MAX_MAPPED_VARS];

  struct DDE_data *parent; // link to its parent node
  struct DDE_data *child;  // link to its child node
};

// The head of the list of DDE data nodes.
// TODO. We don't really need this head pointer: the list is used like a
// stack, so access to the end is enough.
extern struct DDE_data *DDE_head;

// The tail of the list
extern struct DDE_data *DDE_tail;
323
324
// Parameterless functions below are declared with (void): in C an empty
// list () leaves the parameters unspecified instead of declaring "none".

// Create a new DDE-data node and append it to the end of the tracking list.
// Copy all variables from its parent node into the set of inherited variables.
//void XOMP_Device_Data_Environment_Enter();
extern void xomp_deviceDataEnvironmentEnter(void);

// Check if an original variable is already mapped in an enclosing data
// environment; return its device variable's address if yes, NULL if not.
//void* XOMP_Device_Data_Environment_Get_Inherited_Variable (void* original_variable_address, int size);
extern void *xomp_deviceDataEnvironmentGetInheritedVariable(void *original_variable_address, int size);

// NOTE(review): presumably records a newly mapped variable (original address,
// size, device address, copy-back flag) in the current DDE -- confirm.
//void XOMP_Device_Data_Environment_Add_Variable (void* var_addr, int var_size, void * dev_addr);
extern void xomp_deviceDataEnvironmentAddVariable(void *var_addr, int var_size,
                                                  void *dev_addr, bool copy_back);

// Exit the current DDE: deallocate device memory and delete the DDE-data node
// from the end of the tracking list.
//void XOMP_Device_Data_Environment_Exit();
extern void xomp_deviceDataEnvironmentExit(void);
341
342
343
#ifdef __cplusplus
344
}
345
#endif
346
347
#endif
/* LIB_XOMP_H */
348
349
350
351
rose-edg4x
src
midend
programTransformation
ompLowering
libxomp.h
Generated on Mon May 5 2014 17:29:24 for ROSE by
1.8.4