HClib  0.3
Documentation for Habanero-C Library API
 All Data Structures Functions Typedefs Enumerations Groups
forasync.c
1 /* Copyright (c) 2013, Rice University
2 
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions are
5 met:
6 
7 1. Redistributions of source code must retain the above copyright
8  notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above
10  copyright notice, this list of conditions and the following
11  disclaimer in the documentation and/or other materials provided
12  with the distribution.
13 3. Neither the name of Rice University
14  nor the names of its contributors may be used to endorse or
15  promote products derived from this software without specific
16  prior written permission.
17 
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30  */
31 
32 #include <stdio.h>
33 #include <assert.h>
34 
35 #include "hclib.h"
36 #include "rt-hclib-def.h"
37 #include "runtime-support.h"
38 #include "runtime-hclib.h"
39 #ifdef HAVE_PHASER
40 #include "phased.h"
41 #endif
42 
43 #define DEBUG_FORASYNC 0
44 
45 void forasync1D_runner(void * forasync_arg) {
46  forasync1D_t * forasync = (forasync1D_t *) forasync_arg;
47  async_t * user = *((async_t **) forasync_arg);
48  forasync1D_Fct_t user_fct_ptr = (forasync1D_Fct_t) user->fct_ptr;
49  void * user_arg = (void *) user->arg;
50  loop_domain_t loop0 = forasync->loop0;
51  int i=0;
52  for(i=loop0.low; i<loop0.high; i+=loop0.stride) {
53  (*user_fct_ptr)(user_arg, i);
54  }
55 }
56 
57 void forasync2D_runner(void * forasync_arg) {
58  forasync2D_t * forasync = (forasync2D_t *) forasync_arg;
59  async_t * user = *((async_t **) forasync_arg);
60  forasync2D_Fct_t user_fct_ptr = (forasync2D_Fct_t) user->fct_ptr;
61  void * user_arg = (void *) user->arg;
62  loop_domain_t loop0 = forasync->loop0;
63  loop_domain_t loop1 = forasync->loop1;
64  int i=0,j=0;
65  for(i=loop0.low; i<loop0.high; i+=loop0.stride) {
66  for(j=loop1.low; j<loop1.high; j+=loop1.stride) {
67  (*user_fct_ptr)(user_arg, i, j);
68  }
69  }
70 }
71 
72 void forasync3D_runner(void * forasync_arg) {
73  forasync3D_t * forasync = (forasync3D_t *) forasync_arg;
74  async_t * user = *((async_t **) forasync_arg);
75  forasync3D_Fct_t user_fct_ptr = (forasync3D_Fct_t) user->fct_ptr;
76  void * user_arg = (void *) user->arg;
77  loop_domain_t loop0 = forasync->loop0;
78  loop_domain_t loop1 = forasync->loop1;
79  loop_domain_t loop2 = forasync->loop2;
80  int i=0,j=0,k=0;
81  for(i=loop0.low; i<loop0.high; i+=loop0.stride) {
82  for(j=loop1.low; j<loop1.high; j+=loop1.stride) {
83  for(k=loop2.low; k<loop2.high; k+=loop2.stride) {
84  (*user_fct_ptr)(user_arg, i, j, k);
85  }
86  }
87  }
88 #if DEBUG_FORASYNC
89  printf("forasync spawned %d\n", nb_spawn);
90 #endif
91 }
92 
93 void forasync1D_recursive(void * forasync_arg) {
94  forasync1D_t * forasync = (forasync1D_t *) forasync_arg;
95  loop_domain_t loop0 = forasync->loop0;
96  int high0 = loop0.high;
97  int low0 = loop0.low;
98  int stride0 = loop0.stride;
99  int tile0 = loop0.tile;
100 
101  //split the range into two, spawn a new task for the first half and recurse on the rest
102  forasync1D_task_t * new_forasync_task = NULL;
103  if((high0-low0) > tile0) {
104  int mid = (high0+low0)/2;
105  // upper-half
106  new_forasync_task = allocate_forasync1D_task();
107  new_forasync_task->task.forasync_task.def.fct_ptr = forasync1D_recursive;
108  new_forasync_task->task.forasync_task.def.arg = &(new_forasync_task->def);
109  new_forasync_task->def.base.user = forasync->base.user;
110  loop_domain_t new_loop0 = {mid, high0, stride0, tile0};
111  new_forasync_task->def.loop0 = new_loop0;
112  // update lower-half
113  forasync->loop0.high = mid;
114  // delegate scheduling to the underlying runtime
115  //TODO can we make this a special async to avoid a get_current_finish ?
116  schedule_async((async_task_t*)new_forasync_task, get_current_finish(), NO_PROP);
117  //continue to work on the half task
118  forasync1D_recursive(forasync_arg);
119  } else {
120  //compute the tile
121  forasync1D_runner(forasync_arg);
122  }
123 }
124 
125 void forasync2D_recursive(void * forasync_arg) {
126  forasync2D_t * forasync = (forasync2D_t *) forasync_arg;
127  loop_domain_t loop0 = forasync->loop0;
128  int high0 = loop0.high;
129  int low0 = loop0.low;
130  int stride0 = loop0.stride;
131  int tile0 = loop0.tile;
132  loop_domain_t loop1 = forasync->loop1;
133  int high1 = loop1.high;
134  int low1 = loop1.low;
135  int stride1 = loop1.stride;
136  int tile1 = loop1.tile;
137 
138  //split the range into two, spawn a new task for the first half and recurse on the rest
139  forasync2D_task_t * new_forasync_task = NULL;
140  if((high0-low0) > tile0) {
141  int mid = (high0+low0)/2;
142  // upper-half
143  new_forasync_task = allocate_forasync2D_task();
144  new_forasync_task->task.forasync_task.def.fct_ptr = forasync2D_recursive;
145  new_forasync_task->task.forasync_task.def.arg = &(new_forasync_task->def);
146  new_forasync_task->def.base.user = forasync->base.user;
147  loop_domain_t new_loop0 = {mid, high0, stride0, tile0};;
148  new_forasync_task->def.loop0 = new_loop0;
149  new_forasync_task->def.loop1 = loop1;
150  // update lower-half
151  forasync->loop0.high = mid;
152  } else if((high1-low1) > tile1) {
153  int mid = (high1+low1)/2;
154  // upper-half
155  new_forasync_task = allocate_forasync2D_task();
156  new_forasync_task->task.forasync_task.def.fct_ptr = forasync2D_recursive;
157  new_forasync_task->task.forasync_task.def.arg = &(new_forasync_task->def);
158  new_forasync_task->def.base.user = forasync->base.user;
159  new_forasync_task->def.loop0 = loop0;
160  loop_domain_t new_loop1 = {mid, high1, stride1, tile1};
161  new_forasync_task->def.loop1 = new_loop1;
162  // update lower-half
163  forasync->loop1.high = mid;
164  }
165  // recurse
166  if(new_forasync_task != NULL) {
167  // delegate scheduling to the underlying runtime
168  //TODO can we make this a special async to avoid a get_current_async ?
169  schedule_async((async_task_t*)new_forasync_task, get_current_finish(), NO_PROP);
170  //continue to work on the half task
171  forasync2D_recursive(forasync_arg);
172  } else { //compute the tile
173  forasync2D_runner(forasync_arg);
174  }
175 }
176 
177 void forasync3D_recursive(void * forasync_arg) {
178  forasync3D_t * forasync = (forasync3D_t *) forasync_arg;
179  loop_domain_t loop0 = forasync->loop0;
180  int high0 = loop0.high;
181  int low0 = loop0.low;
182  int stride0 = loop0.stride;
183  int tile0 = loop0.tile;
184  loop_domain_t loop1 = forasync->loop1;
185  int high1 = loop1.high;
186  int low1 = loop1.low;
187  int stride1 = loop1.stride;
188  int tile1 = loop1.tile;
189  loop_domain_t loop2 = forasync->loop2;
190  int high2 = loop2.high;
191  int low2 = loop2.low;
192  int stride2 = loop2.stride;
193  int tile2 = loop2.tile;
194 
195  //split the range into two, spawn a new task for the first half and recurse on the rest
196  forasync3D_task_t * new_forasync_task = NULL;
197  if((high0-low0) > tile0) {
198  int mid = (high0+low0)/2;
199  // upper-half
200  new_forasync_task = allocate_forasync3D_task();
201  new_forasync_task->task.forasync_task.def.fct_ptr = forasync3D_recursive;
202  new_forasync_task->task.forasync_task.def.arg = &(new_forasync_task->def);
203  new_forasync_task->def.base.user = forasync->base.user;
204  loop_domain_t new_loop0 = {mid, high0, stride0, tile0};
205  new_forasync_task->def.loop0 = new_loop0;
206  new_forasync_task->def.loop1 = loop1;
207  new_forasync_task->def.loop2 = loop2;
208  // update lower-half
209  forasync->loop0.high = mid;
210  } else if((high1-low1) > tile1) {
211  int mid = (high1+low1)/2;
212  // upper-half
213  new_forasync_task = allocate_forasync3D_task();
214  new_forasync_task->task.forasync_task.def.fct_ptr = forasync3D_recursive;
215  new_forasync_task->task.forasync_task.def.arg = &(new_forasync_task->def);
216  new_forasync_task->def.base.user = forasync->base.user;
217  new_forasync_task->def.loop0 = loop0;
218  loop_domain_t new_loop1 = {mid, high1, stride1, tile1};
219  new_forasync_task->def.loop1 = new_loop1;
220  new_forasync_task->def.loop2 = loop2;
221  // update lower-half
222  forasync->loop1.high = mid;
223  } else if((high2-low2) > tile2) {
224  int mid = (high2+low2)/2;
225  // upper-half
226  new_forasync_task = allocate_forasync3D_task();
227  new_forasync_task->task.forasync_task.def.fct_ptr = forasync3D_recursive;
228  new_forasync_task->task.forasync_task.def.arg = &(new_forasync_task->def);
229  new_forasync_task->def.base.user = forasync->base.user;
230  new_forasync_task->def.loop0 = loop0;
231  new_forasync_task->def.loop1 = loop1;
232  loop_domain_t new_loop2 = {mid, high2, stride2, tile2};
233  new_forasync_task->def.loop2 = new_loop2;
234  // update lower-half
235  forasync->loop2.high = mid;
236  }
237  // recurse
238  if(new_forasync_task != NULL) {
239  // delegate scheduling to the underlying runtime
240  //TODO can we make this a special async to avoid a get_current_async ?
241  schedule_async((async_task_t*)new_forasync_task, get_current_finish(), NO_PROP);
242  //continue to work on the half task
243  forasync3D_recursive(forasync_arg);
244  } else { //compute the tile
245  forasync3D_runner(forasync_arg);
246  }
247 }
248 
249 void forasync1D_flat(void * forasync_arg) {
250  forasync1D_t * forasync = (forasync1D_t *) forasync_arg;
251  loop_domain_t loop0 = forasync->loop0;
252  int high0 = loop0.high;
253  int stride0 = loop0.stride;
254  int tile0 = loop0.tile;
255  int nb_chunks = (int) (high0/tile0);
256  int size = tile0*nb_chunks;
257  finish_t * current_finish = get_current_finish();
258  int low0;
259  for(low0 = loop0.low; low0<size; low0+=tile0) {
260  #if DEBUG_FORASYNC
261  printf("Scheduling Task %d %d\n",low0,(low0+tile0));
262  #endif
263  //TODO block allocation ?
264  forasync1D_task_t * new_forasync_task = allocate_forasync1D_task();
265  new_forasync_task->task.forasync_task.def.fct_ptr = forasync1D_runner;
266  new_forasync_task->task.forasync_task.def.arg = &(new_forasync_task->def);
267  new_forasync_task->def.base.user = forasync->base.user;
268  loop_domain_t new_loop0 = {low0, low0+tile0, stride0, tile0};
269  new_forasync_task->def.loop0 = new_loop0;
270  schedule_async((async_task_t*)new_forasync_task, current_finish, NO_PROP);
271  }
272  // handling leftover
273  if (size < high0) {
274  #if DEBUG_FORASYNC
275  printf("Scheduling Task %d %d\n",low0,high0);
276  #endif
277  forasync1D_task_t * new_forasync_task = allocate_forasync1D_task();
278  new_forasync_task->task.forasync_task.def.fct_ptr = forasync1D_runner;
279  new_forasync_task->task.forasync_task.def.arg = &(new_forasync_task->def);
280  new_forasync_task->def.base.user = forasync->base.user;
281  loop_domain_t new_loop0 = {low0, high0, loop0.stride, loop0.tile};
282  new_forasync_task->def.loop0 = new_loop0;
283  schedule_async((async_task_t*)new_forasync_task, current_finish, NO_PROP);
284  }
285 }
286 
287 void forasync2D_flat(void * forasync_arg) {
288  forasync2D_t * forasync = (forasync2D_t *) forasync_arg;
289  loop_domain_t loop0 = forasync->loop0;
290  loop_domain_t loop1 = forasync->loop1;
291  finish_t * current_finish = get_current_finish();
292  int low0, low1;
293  for(low0=loop0.low; low0<loop0.high; low0+=loop0.tile) {
294  int high0 = (low0+loop0.tile)>loop0.high?loop0.high:(low0+loop0.tile);
295  #if DEBUG_FORASYNC
296  printf("Scheduling Task Loop1 %d %d\n",low0,high0);
297  #endif
298  for(low1=loop1.low; low1<loop1.high; low1+=loop1.tile) {
299  int high1 = (low1+loop1.tile)>loop1.high?loop1.high:(low1+loop1.tile);
300  #if DEBUG_FORASYNC
301  printf("Scheduling Task %d %d\n",low1,high1);
302  #endif
303  forasync2D_task_t * new_forasync_task = allocate_forasync2D_task();
304  new_forasync_task->task.forasync_task.def.fct_ptr = forasync2D_runner;
305  new_forasync_task->task.forasync_task.def.arg = &(new_forasync_task->def);
306  new_forasync_task->def.base.user = forasync->base.user;
307  loop_domain_t new_loop0 = {low0, high0, loop0.stride, loop0.tile};
308  new_forasync_task->def.loop0 = new_loop0;
309  loop_domain_t new_loop1 = {low1, high1, loop1.stride, loop1.tile};
310  new_forasync_task->def.loop1 = new_loop1;
311  schedule_async((async_task_t*)new_forasync_task, current_finish, NO_PROP);
312  }
313  }
314 }
315 
316 void forasync3D_flat(void * forasync_arg) {
317  forasync3D_t * forasync = (forasync3D_t *) forasync_arg;
318  loop_domain_t loop0 = forasync->loop0;
319  loop_domain_t loop1 = forasync->loop1;
320  loop_domain_t loop2 = forasync->loop2;
321  finish_t * current_finish = get_current_finish();
322  int low0, low1, low2;
323  for(low0=loop0.low; low0<loop0.high; low0+=loop0.tile) {
324  int high0 = (low0+loop0.tile)>loop0.high?loop0.high:(low0+loop0.tile);
325  #if DEBUG_FORASYNC
326  printf("Scheduling Task Loop1 %d %d\n",low0,high0);
327  #endif
328  for(low1=loop1.low; low1<loop1.high; low1+=loop1.tile) {
329  int high1 = (low1+loop1.tile)>loop1.high?loop1.high:(low1+loop1.tile);
330  #if DEBUG_FORASYNC
331  printf("Scheduling Task Loop2 %d %d\n",low1,high1);
332  #endif
333  for(low2=loop2.low; low2<loop2.high; low2+=loop2.tile) {
334  int high2 = (low2+loop2.tile)>loop2.high?loop2.high:(low2+loop2.tile);
335  #if DEBUG_FORASYNC
336  printf("Scheduling Task %d %d\n",low2,high2);
337  #endif
338  forasync3D_task_t * new_forasync_task = allocate_forasync3D_task();
339  new_forasync_task->task.forasync_task.def.fct_ptr = forasync3D_runner;
340  new_forasync_task->task.forasync_task.def.arg = &(new_forasync_task->def);
341  new_forasync_task->def.base.user = forasync->base.user;
342  loop_domain_t new_loop0 = {low0, high0, loop0.stride, loop0.tile};
343  new_forasync_task->def.loop0 = new_loop0;
344  loop_domain_t new_loop1 = {low1, high1, loop1.stride, loop1.tile};
345  new_forasync_task->def.loop1 = new_loop1;
346  loop_domain_t new_loop2 = {low2, high2, loop2.stride, loop2.tile};
347  new_forasync_task->def.loop2 = new_loop2;
348  schedule_async((async_task_t*)new_forasync_task, current_finish, NO_PROP);
349  }
350  }
351  }
352 }
353 
354 static void forasync_internal(void* user_fct_ptr, void * user_arg,
355  accumed_t * accumed,
356  int dim, loop_domain_t * loop_domain, forasync_mode_t mode) {
357  // All the sub-asyncs share async_def
358 
359  // The user loop code to execute
360  async_t user_def;
361  user_def.fct_ptr = user_fct_ptr;
362  user_def.arg = user_arg;
363 
364  start_finish();
365  if (accumed != NULL) {
366  accum_register(accumed->accums, accumed->count);
367  }
368 
369  assert(dim>0 && dim<4);
370  // TODO put those somewhere as static
371  asyncFct_t fct_ptr_rec[3] = {forasync1D_recursive, forasync2D_recursive, forasync3D_recursive};
372  asyncFct_t fct_ptr_flat[3] = {forasync1D_flat, forasync2D_flat, forasync3D_flat};
373  asyncFct_t * fct_ptr = (mode == FORASYNC_MODE_RECURSIVE) ? fct_ptr_rec : fct_ptr_flat;
374  if (dim==1) {
375  forasync1D_t forasync = {{&user_def}, loop_domain[0]};
376  (fct_ptr[dim-1])((void *) &forasync);
377  } else if(dim==2) {
378  forasync2D_t forasync = {{&user_def}, loop_domain[0], loop_domain[1]};
379  (fct_ptr[dim-1])((void *) &forasync);
380  } else if(dim==3) {
381  forasync3D_t forasync = {{&user_def}, loop_domain[0], loop_domain[1], loop_domain[2]};
382  (fct_ptr[dim-1])((void *) &forasync);
383  }
384 
385  end_finish();
386 }
387 
388 //
389 // forasync. runtime_type specifies the type of runtime (1 = recursive) (default = chunk)
390 void forasync(void* forasync_fct, void * argv, struct ddf_st ** ddf_list, struct _phased_t * phased_clause,
391  struct _accumed_t * accumed, int dim, loop_domain_t * domain, forasync_mode_t mode) {
392  forasync_internal(forasync_fct, argv, accumed, dim, domain, mode);
393 }