From 6e60b7802bb0369cc9dd8b1715073a1ff3c18f03 Mon Sep 17 00:00:00 2001 From: Edoardo Pasca Date: Fri, 24 Jan 2020 13:59:09 +0000 Subject: axpby as concrete method in DataContainer and BlockDataContainer (#489) * axpby as concrete method in DataContainer and BlockDataContainer * fixed axpby and added unittest * PDHG to use axpby * pass num_threads to axpby * void commit * add seed to random in test * NUM_THREADS can be imported from ccpi.utilities * added test to axpby with num_threads --- .../Python/ccpi/framework/BlockDataContainer.py | 47 ++++++- Wrappers/Python/ccpi/framework/framework.py | 31 +++-- .../Python/ccpi/optimisation/algorithms/PDHG.py | 16 ++- .../optimisation/operators/GradientOperator.py | 7 +- Wrappers/Python/ccpi/utilities/__init__.py | 22 +++ Wrappers/Python/test/test_BlockDataContainer.py | 155 +++++++++++++++++++++ Wrappers/Python/test/test_DataContainer.py | 15 +- Wrappers/Python/test/test_Gradient.py | 4 +- src/Core/CMakeLists.txt | 5 +- src/Core/FiniteDifferenceLibrary.c | 11 -- src/Core/axpby.c | 84 +---------- src/Core/include/FiniteDifferenceLibrary.h | 3 +- src/Core/include/axpby.h | 13 +- src/Core/include/utilities.h | 3 + src/Core/utilities.c | 14 ++ 15 files changed, 300 insertions(+), 130 deletions(-) create mode 100644 src/Core/include/utilities.h create mode 100644 src/Core/utilities.c diff --git a/Wrappers/Python/ccpi/framework/BlockDataContainer.py b/Wrappers/Python/ccpi/framework/BlockDataContainer.py index 22cee03..a0d139b 100644 --- a/Wrappers/Python/ccpi/framework/BlockDataContainer.py +++ b/Wrappers/Python/ccpi/framework/BlockDataContainer.py @@ -23,6 +23,7 @@ import numpy from numbers import Number import functools from ccpi.framework import DataContainer +from ccpi.utilities import NUM_THREADS #from ccpi.framework import AcquisitionData, ImageData #from ccpi.optimisation.operators import Operator, LinearOperator @@ -50,11 +51,12 @@ class BlockDataContainer(object): A * 3 = [ 3 * [B,C] , 3* D] = [ [ 3*B, 3*C] , 3*D ] ''' - ADD = 'add' + ADD = 'add' SUBTRACT = 'subtract' MULTIPLY = 'multiply' - DIVIDE = 'divide' - POWER = 'power' + DIVIDE = 'divide' + POWER = 'power' + AXPBY = 'axpby' __array_priority__ = 1 __container_priority__ = 2 def __init__(self, *args, **kwargs): @@ -173,7 +175,22 @@ class BlockDataContainer(object): self.binary_operations(BlockDataContainer.DIVIDE, other, *args, **kwargs) else: return self.binary_operations(BlockDataContainer.DIVIDE, other, *args, **kwargs) - + def axpby(self, a, b, y, out, dtype=numpy.float32, num_threads = NUM_THREADS): + r'''performs axpby element-wise on the BlockDataContainer containers + + Does the operation .. math:: a*x+b*y and stores the result in out, where x is self + + :param a: scalar + :param b: scalar + :param y: compatible (Block)DataContainer + :param out: (Block)DataContainer to store the result + :param dtype: optional, data type of the DataContainers + ''' + if out is None: + raise ValueError("out container cannot be None") + kwargs = {'a':a, 'b':b, 'out':out, 'dtype': dtype, 'num_threads': NUM_THREADS} + self.binary_operations(BlockDataContainer.AXPBY, y, **kwargs) + def binary_operations(self, operation, other, *args, **kwargs): '''Algebra: generic method of algebric operation with BlockDataContainer with number/DataContainer or BlockDataContainer @@ -234,11 +251,19 @@ class BlockDataContainer(object): op = el.divide elif operation == BlockDataContainer.POWER: op = el.power + elif operation == BlockDataContainer.AXPBY: + if not isinstance(other, BlockDataContainer): + raise ValueError("{} cannot handle {}".format(operation, type(other))) + op = el.axpby else: raise ValueError('Unsupported operation', operation) if out is not None: kw['out'] = out.get_item(i) - op(ot, *args, **kw) + if operation == BlockDataContainer.AXPBY: + kw['y'] = ot + el.axpby(kw['a'], kw['b'], kw['y'], kw['out'], kw['dtype'], kw['num_threads']) + else: + op(ot, *args, **kw) else: res.append(op(ot, *args, **kw)) if out is not None: @@ -249,6 +274,12 @@ class BlockDataContainer(object): else: # try to do algebra with one DataContainer. Will raise error if not compatible kw = kwargs.copy() + if operation != BlockDataContainer.AXPBY: + # remove keyworded argument related to AXPBY + for k in ['a','b','y', 'num_threads', 'dtype']: + if k in kw.keys(): + kw.pop(k) + res = [] for i,el in enumerate(self.containers): if operation == BlockDataContainer.ADD: @@ -261,6 +292,12 @@ class BlockDataContainer(object): op = el.divide elif operation == BlockDataContainer.POWER: op = el.power + elif operation == BlockDataContainer.AXPBY: + # As out cannot be None, it is safe to continue the + # for loop after the call to axpby + kw['out'] = out.get_item(i) + el.axpby(kw['a'], kw['b'], other, kw['out'], kw['dtype'], kw['num_threads']) + continue else: raise ValueError('Unsupported operation', operation) if out is not None: diff --git a/Wrappers/Python/ccpi/framework/framework.py b/Wrappers/Python/ccpi/framework/framework.py index 65121d2..6f1ed1c 100644 --- a/Wrappers/Python/ccpi/framework/framework.py +++ b/Wrappers/Python/ccpi/framework/framework.py @@ -26,6 +26,7 @@ import warnings from functools import reduce from numbers import Number import ctypes, platform +from ccpi.utilities import NUM_THREADS # dll = os.path.abspath(os.path.join( # os.path.abspath(os.path.dirname(__file__)), @@ -45,6 +46,11 @@ else: #print ("dll location", dll) cilacc = ctypes.cdll.LoadLibrary(dll) +#default nThreads +# import multiprocessing +# cpus = multiprocessing.cpu_count() +# NUM_THREADS = max(int(cpus/2),1) + def find_key(dic, val): """return the key of dictionary dic given the value""" @@ -828,24 +834,27 @@ class DataContainer(object): def minimum(self,x2, out=None, *args, **kwargs): return self.pixel_wise_binary(numpy.minimum, x2=x2, out=out, *args, **kwargs) - @staticmethod - def axpby(a,x,b,y,out,dtype=numpy.float32): + def axpby(self, a, b, y, out, dtype=numpy.float32, num_threads=NUM_THREADS): '''performs axpby with cilacc C library - Does the operation .. math:: a*x+b*y and stores the result in out + Does the operation .. math:: a*x+b*y and stores the result in out, where x is self :param a: scalar - :param x: DataContainer + :type a: float :param b: scalar + :type b: float :param y: DataContainer - :param out: DataContainer to store the result - :param dtype: optional, data type of the DataContainers + :param out: DataContainer instance to store the result + :param dtype: data type of the DataContainers + :type dtype: numpy type, optional, default numpy.float32 + :param num_threads: number of threads to run on + :type num_threads: int, optional, default 1/2 CPU of the system ''' c_float_p = ctypes.POINTER(ctypes.c_float) c_double_p = ctypes.POINTER(ctypes.c_double) # get the reference to the data - ndx = x.as_array() + ndx = self.as_array() ndy = y.as_array() ndout = out.as_array() @@ -879,15 +888,17 @@ class DataContainer(object): ctypes.POINTER(ctypes.c_float), # pointer to the third array ctypes.c_float, # type of A (float) ctypes.c_float, # type of B (float) - ctypes.c_long] # type of size of first array + ctypes.c_long, # type of size of first array + ctypes.c_int] # number of threads cilacc.daxpby.argtypes = [ctypes.POINTER(ctypes.c_double), # pointer to the first array ctypes.POINTER(ctypes.c_double), # pointer to the second array ctypes.POINTER(ctypes.c_double), # pointer to the third array ctypes.c_double, # type of A (c_double) ctypes.c_double, # type of B (c_double) - ctypes.c_long] # type of size of first array + ctypes.c_long, # type of size of first array + ctypes.c_int] # number of threads - if f(x_p, y_p, out_p, a, b, ndx.size) != 0: + if f(x_p, y_p, out_p, a, b, ndx.size, num_threads) != 0: raise RuntimeError('axpby execution failed') diff --git a/Wrappers/Python/ccpi/optimisation/algorithms/PDHG.py b/Wrappers/Python/ccpi/optimisation/algorithms/PDHG.py index cc384e3..dcb9298 100644 --- a/Wrappers/Python/ccpi/optimisation/algorithms/PDHG.py +++ b/Wrappers/Python/ccpi/optimisation/algorithms/PDHG.py @@ -130,24 +130,26 @@ class PDHG(Algorithm): # Gradient ascent for the dual variable self.operator.direct(self.xbar, out=self.y_tmp) - self.y_tmp *= self.sigma - self.y_tmp += self.y_old + # self.y_tmp *= self.sigma + # self.y_tmp += self.y_old + self.y_tmp.axpby(self.sigma, 1 , self.y_old, self.y_tmp) # self.y = self.f.proximal_conjugate(self.y_old, self.sigma) self.f.proximal_conjugate(self.y_tmp, self.sigma, out=self.y) # Gradient descent for the primal variable self.operator.adjoint(self.y, out=self.x_tmp) - self.x_tmp *= -1*self.tau - self.x_tmp += self.x_old + # self.x_tmp *= -1*self.tau + # self.x_tmp += self.x_old + self.x_tmp.axpby(-self.tau, 1. , self.x_old, self.x_tmp) self.g.proximal(self.x_tmp, self.tau, out=self.x) # Update self.x.subtract(self.x_old, out=self.xbar) - self.xbar *= self.theta - self.xbar += self.x - + # self.xbar *= self.theta + # self.xbar += self.x + self.xbar.axpby(self.theta, 1 , self.x, self.xbar) def update_objective(self): diff --git a/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py b/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py index a45c3d2..a5feca3 100644 --- a/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py +++ b/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py @@ -22,13 +22,14 @@ from __future__ import print_function from ccpi.optimisation.operators import Operator, LinearOperator, ScaledOperator from ccpi.optimisation.operators import FiniteDiff, SparseFiniteDiff from ccpi.framework import ImageData, ImageGeometry, BlockGeometry, BlockDataContainer +from ccpi.utilities import NUM_THREADS import numpy import warnings #default nThreads -import multiprocessing -cpus = multiprocessing.cpu_count() -NUM_THREADS = max(int(cpus/2),1) +# import multiprocessing +# cpus = multiprocessing.cpu_count() +# NUM_THREADS = max(int(cpus/2),1) NEUMANN = 'Neumann' PERIODIC = 'Periodic' diff --git a/Wrappers/Python/ccpi/utilities/__init__.py b/Wrappers/Python/ccpi/utilities/__init__.py index e69de29..79eaa98 100644 --- a/Wrappers/Python/ccpi/utilities/__init__.py +++ b/Wrappers/Python/ccpi/utilities/__init__.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# This work is part of the Core Imaging Library developed by +# Visual Analytics and Imaging System Group of the Science Technology +# Facilities Council, STFC + +# Copyright 2020 Edoardo Pasca + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#default nThreads + +import multiprocessing +NUM_THREADS = max(int(multiprocessing.cpu_count()/2),1) diff --git a/Wrappers/Python/test/test_BlockDataContainer.py b/Wrappers/Python/test/test_BlockDataContainer.py index bc0e83a..a8e59b0 100755 --- a/Wrappers/Python/test/test_BlockDataContainer.py +++ b/Wrappers/Python/test/test_BlockDataContainer.py @@ -485,4 +485,159 @@ class TestBlockDataContainer(unittest.TestCase): print(err) self.assertTrue(res) + def test_axpby(self): + # test axpby between BlockDataContainers + ig0 = ImageGeometry(2,3,4) + ig1 = ImageGeometry(2,3,5) + + data0 = ig0.allocate(-1) + data2 = ig0.allocate(1) + + data1 = ig0.allocate(2) + data3 = ig0.allocate(3) + + cp0 = BlockDataContainer(data0,data2) + cp1 = BlockDataContainer(data1,data3) + + out = cp0 * 0. - 10 + + cp0.axpby(3,-2,cp1,out, num_threads=4) + + # operation should be [ 3 * -1 + (-2) * 2 , 3 * 1 + (-2) * 3 ] + # output should be [ -7 , -3 ] + res0 = ig0.allocate(-7) + res2 = ig0.allocate(-3) + res = BlockDataContainer(res0, res2) + + print ("res0", res0.as_array()) + print ("res2", res2.as_array()) + + print ("###############################") + + print ("out_0", out.get_item(0).as_array()) + print ("out_1", out.get_item(1).as_array()) + self.assertBlockDataContainerEqual(out, res) + + def test_axpby2(self): + # test axpby with BlockDataContainer and DataContainer + ig0 = ImageGeometry(2,3,4) + # ig1 = ImageGeometry(2,3,5) + + data0 = ig0.allocate(-1) + data2 = ig0.allocate(1) + + data1 = ig0.allocate(2) + # data3 = ig1.allocate(3) + + cp0 = BlockDataContainer(data0,data2) + # cp1 = BlockDataContainer(data1,data3) + + out = cp0 * 0. - 10 + + cp0.axpby(3,-2,data1,out) + + # operation should be [ 3 * -1 + (-2) * 2 , 3 * 1 + (-2) * 2 ] + # output should be [ -7 , -1 ] + res0 = ig0.allocate(-7) + res2 = ig0.allocate(-1) + res = BlockDataContainer(res0, res2) + + print ("res0", res0.as_array()) + print ("res2", res2.as_array()) + + print ("###############################") + + print ("out_0", out.get_item(0).as_array()) + print ("out_1", out.get_item(1).as_array()) + self.assertBlockDataContainerEqual(out, res) + + + def test_axpby3(self): + # test axpby with nested BlockDataContainer + ig0 = ImageGeometry(2,3,4) + ig1 = ImageGeometry(2,3,5) + + data0 = ig0.allocate(-1) + data2 = ig0.allocate(1) + + # data1 = ig0.allocate(2) + data3 = ig1.allocate(3) + + cp0 = BlockDataContainer(data0,data2) + cp1 = BlockDataContainer(cp0 *0. + [2, -2], data3) + print (cp1.get_item(0).get_item(0).as_array()) + print (cp1.get_item(0).get_item(1).as_array()) + print (cp1.get_item(1).as_array()) + print ("###############################") + + + + out = cp1 * 0. + cp2 = out + [1,3] + + print (cp2.get_item(0).get_item(0).as_array()) + print (cp2.get_item(0).get_item(1).as_array()) + print (cp2.get_item(1).as_array()) + + cp2.axpby(3,-2, cp1 ,out) + + # output should be [ [ -1 , 7 ] , 3] + res0 = ig0.allocate(-1) + res2 = ig0.allocate(7) + res3 = ig1.allocate(3) + res = BlockDataContainer(BlockDataContainer(res0, res2), res3) + + # print ("res0", res0.as_array()) + # print ("res2", res2.as_array()) + + print ("###############################") + + # print ("out_0", out.get_item(0).as_array()) + # print ("out_1", out.get_item(1).as_array()) + self.assertBlockDataContainerEqual(out, res) + + def test_axpby4(self): + # test axpby with nested BlockDataContainer + ig0 = ImageGeometry(2,3,4) + ig1 = ImageGeometry(2,3,5) + + data0 = ig0.allocate(-1) + data2 = ig0.allocate(1) + + # data1 = ig0.allocate(2) + data3 = ig1.allocate(3) + + cp0 = BlockDataContainer(data0,data2) + cp1 = BlockDataContainer(cp0 *0. + [2, -2], data3) + print (cp1.get_item(0).get_item(0).as_array()) + print (cp1.get_item(0).get_item(1).as_array()) + print (cp1.get_item(1).as_array()) + print ("###############################") + + + + out = cp1 * 0. + cp2 = out + [1,3] + + + print (cp2.get_item(0).get_item(0).as_array()) + print (cp2.get_item(0).get_item(1).as_array()) + print (cp2.get_item(1).as_array()) + + cp2.axpby(3,-2, cp1 ,out, num_threads=4) + + # output should be [ [ -1 , 7 ] , 3] + res0 = ig0.allocate(-1) + res2 = ig0.allocate(7) + res3 = ig1.allocate(3) + res = BlockDataContainer(BlockDataContainer(res0, res2), res3) + + # print ("res0", res0.as_array()) + # print ("res2", res2.as_array()) + + print ("###############################") + + # print ("out_0", out.get_item(0).as_array()) + # print ("out_1", out.get_item(1).as_array()) + self.assertBlockDataContainerEqual(out, res) diff --git a/Wrappers/Python/test/test_DataContainer.py b/Wrappers/Python/test/test_DataContainer.py index 6e297ee..4a8a6d1 100755 --- a/Wrappers/Python/test/test_DataContainer.py +++ b/Wrappers/Python/test/test_DataContainer.py @@ -740,7 +740,20 @@ class TestDataContainer(unittest.TestCase): d2 = ig.allocate(2) out = ig.allocate(None) # equals to 2 * [1] + 1 * [2] = [4] - DataContainer.axpby(2,d1,1,d2,out) + d1.axpby(2,1,d2,out) + res = numpy.ones_like(d1.as_array()) * 4. + numpy.testing.assert_array_equal(res, out.as_array()) + def test_axpby2(self): + print ("test axpby2") + N = 100 + ig = ImageGeometry(N,2*N,N*10) + d1 = ig.allocate(1) + d2 = ig.allocate(2) + out = ig.allocate(None) + print ("allocated") + # equals to 2 * [1] + 1 * [2] = [4] + d1.axpby(2,1,d2,out, num_threads=4) + print ("calculated") res = numpy.ones_like(d1.as_array()) * 4. numpy.testing.assert_array_equal(res, out.as_array()) diff --git a/Wrappers/Python/test/test_Gradient.py b/Wrappers/Python/test/test_Gradient.py index 5aeede0..78fc261 100755 --- a/Wrappers/Python/test/test_Gradient.py +++ b/Wrappers/Python/test/test_Gradient.py @@ -30,6 +30,8 @@ class TestGradient(unittest.TestCase): N, M, K = 20, 30, 40 channels = 10 + numpy.random.seed(1) + # check range geometry, examples ig1 = ImageGeometry(voxel_num_x = M, voxel_num_y = N) @@ -235,4 +237,4 @@ class TestGradient(unittest.TestCase): grad = Gradient(ig, bnd_cond='Periodic', correlation='SpaceChannels', backend='numpy') self.assertTrue(LinearOperator.dot_test(grad)) - \ No newline at end of file + diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index e828fe5..9c9a89d 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -93,8 +93,9 @@ message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}") -add_library(cilacc SHARED ${CMAKE_CURRENT_SOURCE_DIR}/axpby.c - ${CMAKE_CURRENT_SOURCE_DIR}/FiniteDifferenceLibrary.c ) +add_library(cilacc SHARED ${CMAKE_CURRENT_SOURCE_DIR}/utilities.c + ${CMAKE_CURRENT_SOURCE_DIR}/FiniteDifferenceLibrary.c + ${CMAKE_CURRENT_SOURCE_DIR}/axpby.c ) target_link_libraries(cilacc ${OpenMP_C_LIB_NAMES} ) include_directories(cilacc PUBLIC diff --git a/src/Core/FiniteDifferenceLibrary.c b/src/Core/FiniteDifferenceLibrary.c index fbf2646..244e170 100644 --- a/src/Core/FiniteDifferenceLibrary.c +++ b/src/Core/FiniteDifferenceLibrary.c @@ -16,17 +16,6 @@ DLL_EXPORT int openMPtest(int nThreads) } return nThreads_running; } -void threads_setup(int nThreads_requested, int *nThreads_current) -{ -#pragma omp parallel - { - if (omp_get_thread_num() == 0) - { - *nThreads_current = omp_get_num_threads(); - } - } - omp_set_num_threads(nThreads_requested); -} int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc) { diff --git a/src/Core/axpby.c b/src/Core/axpby.c index c4d162d..54a597f 100755 --- a/src/Core/axpby.c +++ b/src/Core/axpby.c @@ -1,87 +1,12 @@ #include "axpby.h" -DLL_EXPORT int padd(float * x, float * y, float * out, long size){ +DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size, int nThreads){ long i = 0; -#pragma omp parallel for - for (i=0; i < size; i++) - { - *(out + i ) = *(x + i) + *(y+i); - } - return 0; -} - -DLL_EXPORT int psubtract(float * x, float * y, float * out, long size){ - long i = 0; -#pragma omp parallel -{ -//#pragma omp single -//{ -// printf("current number of threads %d\n", omp_get_num_threads()); -//} -#pragma omp for - for (i=0; i < size; i++) - { - *(out + i ) = *(x + i) - *(y+i); - } -} - return 0; - -} - -DLL_EXPORT int pmultiply(float * x, float * y, float * out, long size){ - long i = 0; -#pragma omp parallel for - for (i=0; i < size; i++) - { - *(out + i ) = *(x + i) * *(y+i); - } - return 0; -} - -DLL_EXPORT int pdivide(float * x, float * y, float * out, long size, float default_value) -{ - long i = 0; -#pragma omp parallel for - for (i=0; i < size; i++) - { - *(out + i ) = *(y+i) ? *(x + i) / *(y+i) : default_value; - } - return 0; -} -DLL_EXPORT int ppower(float * x, float * y, float * out, long size){ - long i = 0; -#pragma omp parallel for - for (i=0; i < size; i++) - { - *(out + i ) = (float)pow(*(x + i) , *(y+i)) ; - } - return 0; -} - -DLL_EXPORT int pminimum(float * x, float * y, float * out, long size){ - long i = 0; -#pragma omp parallel for - for (i=0; i < size; i++) - { - *(out + i ) = *(y+i) > (*x+i) ? *(x + i) : *(y+i); - } - return 0; -} - -DLL_EXPORT int pmaximum(float * x, float * y, float * out, long size) { - long i = 0; -#pragma omp parallel for - for (i = 0; i < size; i++) - { - *(out + i) = *(y + i) < (*x + i) ? *(x + i) : *(y + i); - } - return 0; -} + int nThreads_initial; + threads_setup(nThreads, &nThreads_initial); -DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size){ - long i = 0; #pragma omp parallel { #pragma omp for @@ -90,11 +15,12 @@ DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long *(out + i ) = a * ( *(x + i) ) + b * ( *(y + i) ); } } + omp_set_num_threads(nThreads_initial); return 0; } -DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size) { +DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size, int nThreads) { long i = 0; #pragma omp parallel { diff --git a/src/Core/include/FiniteDifferenceLibrary.h b/src/Core/include/FiniteDifferenceLibrary.h index 6e426af..b8e6c4f 100644 --- a/src/Core/include/FiniteDifferenceLibrary.h +++ b/src/Core/include/FiniteDifferenceLibrary.h @@ -3,4 +3,5 @@ #include #include "omp.h" //#include "ipp.h" -#include "dll_export.h" \ No newline at end of file +#include "dll_export.h" +#include "utilities.h" \ No newline at end of file diff --git a/src/Core/include/axpby.h b/src/Core/include/axpby.h index 2849547..e13d6e1 100644 --- a/src/Core/include/axpby.h +++ b/src/Core/include/axpby.h @@ -3,15 +3,8 @@ #include #include "omp.h" #include "dll_export.h" +#include "utilities.h" -DLL_EXPORT int padd(float * x, float * y, float * out, long size); -DLL_EXPORT int psubtract(float * x, float * y, float * out, long size); -DLL_EXPORT int pmultiply(float * x, float * y, float * out, long size); -DLL_EXPORT int pdivide(float * x, float * y, float * out, long size, float default_value); -DLL_EXPORT int ppower(float * x, float * y, float * out, long size); -DLL_EXPORT int pminimum(float * x, float * y, float * out, long size); -DLL_EXPORT int pmaximum(float * x, float * y, float * out, long size); - -DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size); -DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size); +DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size, int nThreads); +DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size, int nThreads); diff --git a/src/Core/include/utilities.h b/src/Core/include/utilities.h new file mode 100644 index 0000000..c3003d6 --- /dev/null +++ b/src/Core/include/utilities.h @@ -0,0 +1,3 @@ +#include "omp.h" + +void threads_setup(int nThreads_requested, int *nThreads_current); \ No newline at end of file diff --git a/src/Core/utilities.c b/src/Core/utilities.c new file mode 100644 index 0000000..86b23e8 --- /dev/null +++ b/src/Core/utilities.c @@ -0,0 +1,14 @@ +#include "utilities.h" + + +void threads_setup(int nThreads_requested, int *nThreads_current) +{ +#pragma omp parallel + { + if (omp_get_thread_num() == 0) + { + *nThreads_current = omp_get_num_threads(); + } + } + omp_set_num_threads(nThreads_requested); +} -- cgit v1.2.3