diff options
| -rw-r--r-- | build/msvc/gen.py | 7 | ||||
| -rw-r--r-- | cuda/3d/mem3d.cu | 19 | ||||
| -rw-r--r-- | cuda/3d/mem3d.h | 3 | ||||
| -rw-r--r-- | include/astra/CompositeGeometryManager.h | 35 | ||||
| -rw-r--r-- | matlab/mex/astra_mex_c.cpp | 45 | ||||
| -rw-r--r-- | python/astra/__init__.py | 8 | ||||
| -rw-r--r-- | python/astra/astra.py | 4 | ||||
| -rw-r--r-- | python/astra/astra_c.pyx | 21 | ||||
| -rw-r--r-- | samples/matlab/s020_3d_multiGPU.m | 38 | ||||
| -rw-r--r-- | samples/python/s019_experimental_multires.py (renamed from samples/python/s018_experimental_multires.py) | 0 | ||||
| -rw-r--r-- | samples/python/s020_3d_multiGPU.py | 57 | ||||
| -rw-r--r-- | src/CompositeGeometryManager.cpp | 787 | ||||
| -rw-r--r-- | src/ConeProjectionGeometry3D.cpp | 3 | 
13 files changed, 834 insertions, 193 deletions
| diff --git a/build/msvc/gen.py b/build/msvc/gen.py index 999710f..cc69a62 100644 --- a/build/msvc/gen.py +++ b/build/msvc/gen.py @@ -1,6 +1,8 @@  from __future__ import print_function  import sys  import os +import codecs +import six  vcppguid = "8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942" # C++ project  siguid = "2150E333-8FDC-42A3-9474-1A3956D46DE8" # project group  @@ -428,7 +430,10 @@ P_astra["files"].sort()  projects = [ P_astra, F_astra_mex, P0, P1, P2, P3, P4, P5, P6, P7, P8 ] -bom = "\xef\xbb\xbf" +if six.PY2: +  bom = "\xef\xbb\xbf" +else: +  bom = codecs.BOM_UTF8.decode("utf-8")  class Configuration:    def __init__(self, debug, cuda, x64): diff --git a/cuda/3d/mem3d.cu b/cuda/3d/mem3d.cu index 6d81dc0..0320117 100644 --- a/cuda/3d/mem3d.cu +++ b/cuda/3d/mem3d.cu @@ -62,6 +62,25 @@ size_t availableGPUMemory()  	return free;  } +int maxBlockDimension() +{ +	int dev; +	cudaError_t err = cudaGetDevice(&dev); +	if (err != cudaSuccess) { +		ASTRA_WARN("Error querying device"); +		return 0; +	} + +	cudaDeviceProp props; +	err = cudaGetDeviceProperties(&props, dev); +	if (err != cudaSuccess) { +		ASTRA_WARN("Error querying device %d properties", dev); +		return 0; +	} + +	return std::min(props.maxTexture3D[0], std::min(props.maxTexture3D[1], props.maxTexture3D[2])); +} +  MemHandle3D allocateGPUMemory(unsigned int x, unsigned int y, unsigned int z, Mem3DZeroMode zero)  {  	SMemHandle3D_internal hnd; diff --git a/cuda/3d/mem3d.h b/cuda/3d/mem3d.h index 82bad19..6fff80b 100644 --- a/cuda/3d/mem3d.h +++ b/cuda/3d/mem3d.h @@ -78,6 +78,7 @@ enum Mem3DZeroMode {  };  size_t availableGPUMemory(); +int maxBlockDimension();  MemHandle3D allocateGPUMemory(unsigned int x, unsigned int y, unsigned int z, Mem3DZeroMode zero); @@ -87,6 +88,8 @@ bool copyFromGPUMemory(float *dst, MemHandle3D src, const SSubDimensions3D &pos)  bool freeGPUMemory(MemHandle3D handle); +bool setGPUIndex(int index); +  bool FP(const astra::CProjectionGeometry3D* pProjGeom, MemHandle3D projData, const astra::CVolumeGeometry3D* pVolGeom, MemHandle3D volData, int iDetectorSuperSampling, astra::Cuda3DProjectionKernel projKernel); diff --git a/include/astra/CompositeGeometryManager.h b/include/astra/CompositeGeometryManager.h index 6610151..18dd72f 100644 --- a/include/astra/CompositeGeometryManager.h +++ b/include/astra/CompositeGeometryManager.h @@ -50,9 +50,16 @@ class CProjectionGeometry3D;  class CProjector3D; +struct SGPUParams { +	std::vector<int> GPUIndices; +	size_t memory; +}; +  class _AstraExport CCompositeGeometryManager {  public: +	CCompositeGeometryManager(); +  	class CPart;  	typedef std::list<boost::shared_ptr<CPart> > TPartList;  	class CPart { @@ -72,7 +79,9 @@ public:  		bool uploadToGPU();  		bool downloadFromGPU(/*mode?*/); -		virtual TPartList split(size_t maxSize, int div) = 0; +		virtual void splitX(TPartList& out, size_t maxSize, size_t maxDim, int div) = 0; +		virtual void splitY(TPartList& out, size_t maxSize, size_t maxDim, int div) = 0; +		virtual void splitZ(TPartList& out, size_t maxSize, size_t maxDim, int div) = 0;  		virtual CPart* reduce(const CPart *other) = 0;  		virtual void getDims(size_t &x, size_t &y, size_t &z) = 0;  		size_t getSize(); @@ -86,7 +95,9 @@ public:  		CVolumeGeometry3D* pGeom; -		virtual TPartList split(size_t maxSize, int div); +		virtual void splitX(TPartList& out, size_t maxSize, size_t maxDim, int div); +		virtual void splitY(TPartList& out, size_t maxSize, size_t maxDim, int div); +		virtual void splitZ(TPartList& out, size_t maxSize, size_t maxDim, int div);  		virtual CPart* reduce(const CPart *other);  		virtual void getDims(size_t &x, size_t &y, size_t &z); @@ -100,7 +111,9 @@ public:  		CProjectionGeometry3D* pGeom; -		virtual TPartList split(size_t maxSize, int div); +		virtual void splitX(TPartList& out, size_t maxSize, size_t maxDim, int div); +		virtual void splitY(TPartList& out, size_t maxSize, size_t maxDim, int div); +		virtual void splitZ(TPartList& out, size_t maxSize, size_t maxDim, int div);  		virtual CPart* reduce(const CPart *other);  		virtual void getDims(size_t &x, size_t &y, size_t &z); @@ -130,6 +143,13 @@ public:  	bool doJobs(TJobList &jobs); +	SJob createJobFP(CProjector3D *pProjector, +                     CFloat32VolumeData3DMemory *pVolData, +                     CFloat32ProjectionData3DMemory *pProjData); +	SJob createJobBP(CProjector3D *pProjector, +                     CFloat32VolumeData3DMemory *pVolData, +                     CFloat32ProjectionData3DMemory *pProjData); +  	// Convenience functions for creating and running a single FP or BP job  	bool doFP(CProjector3D *pProjector, CFloat32VolumeData3DMemory *pVolData,  	          CFloat32ProjectionData3DMemory *pProjData); @@ -139,10 +159,19 @@ public:  	bool doFP(CProjector3D *pProjector, const std::vector<CFloat32VolumeData3DMemory *>& volData, const std::vector<CFloat32ProjectionData3DMemory *>& projData);  	bool doBP(CProjector3D *pProjector, const std::vector<CFloat32VolumeData3DMemory *>& volData, const std::vector<CFloat32ProjectionData3DMemory *>& projData); +	void setGPUIndices(const std::vector<int>& GPUIndices); + +	static void setGlobalGPUParams(const SGPUParams& params); +  protected:  	bool splitJobs(TJobSet &jobs, size_t maxSize, int div, TJobSet &split); +	std::vector<int> m_GPUIndices; +	size_t m_iMaxSize; + + +	static SGPUParams* s_params;  };  } diff --git a/matlab/mex/astra_mex_c.cpp b/matlab/mex/astra_mex_c.cpp index d34334c..fdf4f33 100644 --- a/matlab/mex/astra_mex_c.cpp +++ b/matlab/mex/astra_mex_c.cpp @@ -38,6 +38,7 @@ $Id$  #include "astra/Globals.h"  #ifdef ASTRA_CUDA  #include "../cuda/2d/darthelper.h" +#include "astra/CompositeGeometryManager.h"  #endif  using namespace std;  using namespace astra; @@ -83,12 +84,46 @@ void astra_mex_use_cuda(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs   * Set active GPU   */  void astra_mex_set_gpu_index(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) -{  +{  #ifdef ASTRA_CUDA -	if (nrhs >= 2) { -		bool ret = astraCUDA::setGPUIndex((int)mxGetScalar(prhs[1])); -		if (!ret) -			mexPrintf("Failed to set GPU %d\n", (int)mxGetScalar(prhs[1])); +	bool usage = false; +	if (nrhs != 2 && nrhs != 4) { +		usage = true; +	} + +	astra::SGPUParams params; +	params.memory = 0; + +	if (!usage && nrhs >= 4) { +		std::string s = mexToString(prhs[2]); +		if (s != "memory") { +			usage = true; +		} else { +			params.memory = (size_t)mxGetScalar(prhs[3]); +		} +	} + +	if (!usage && nrhs >= 2) { +		int n = mxGetN(prhs[1]) * mxGetM(prhs[1]); +		params.GPUIndices.resize(n); +		double* pdMatlabData = mxGetPr(prhs[1]); +		for (int i = 0; i < n; ++i) +			params.GPUIndices[i] = (int)pdMatlabData[i]; + + +		astra::CCompositeGeometryManager::setGlobalGPUParams(params); + + +		// Set first GPU +		if (n >= 1) { +			bool ret = astraCUDA::setGPUIndex((int)pdMatlabData[0]); +			if (!ret) +				mexPrintf("Failed to set GPU %d\n", (int)pdMatlabData[0]); +		} +	} + +	if (usage) { +		mexPrintf("Usage: astra_mex('set_gpu_index', index/indices [, 'memory', memory])");  	}  #endif  } diff --git a/python/astra/__init__.py b/python/astra/__init__.py index 10ed74d..515d9a2 100644 --- a/python/astra/__init__.py +++ b/python/astra/__init__.py @@ -39,7 +39,7 @@ from . import log  from .optomo import OpTomo  import os -try: -    astra.set_gpu_index(int(os.environ['ASTRA_GPU_INDEX'])) -except KeyError: -    pass + +if 'ASTRA_GPU_INDEX' in os.environ: +    L = [ int(x) for x in os.environ['ASTRA_GPU_INDEX'].split(',') ] +    astra.set_gpu_index(L) diff --git a/python/astra/astra.py b/python/astra/astra.py index 26b1ff0..9328b6b 100644 --- a/python/astra/astra.py +++ b/python/astra/astra.py @@ -49,10 +49,10 @@ def version(printToScreen=False):      """      return a.version(printToScreen) -def set_gpu_index(idx): +def set_gpu_index(idx, memory=0):      """Set default GPU index to use.      :param idx: GPU index      :type idx: :class:`int`      """ -    a.set_gpu_index(idx) +    a.set_gpu_index(idx, memory) diff --git a/python/astra/astra_c.pyx b/python/astra/astra_c.pyx index 5075fed..65192b5 100644 --- a/python/astra/astra_c.pyx +++ b/python/astra/astra_c.pyx @@ -31,6 +31,7 @@ import six  from .utils import wrap_from_bytes  from libcpp.string cimport string +from libcpp.vector cimport vector  from libcpp cimport bool  cdef extern from "astra/Globals.h" namespace "astra":      int getVersion() @@ -43,6 +44,12 @@ IF HAVE_CUDA==True:  ELSE:    def setGPUIndex():      pass +cdef extern from "astra/CompositeGeometryManager.h" namespace "astra": +    cdef cppclass SGPUParams: +        vector[int] GPUIndices +        size_t memory +cdef extern from "astra/CompositeGeometryManager.h" namespace "astra::CCompositeGeometryManager": +    void setGlobalGPUParams(SGPUParams&)  def credits():      six.print_("""The ASTRA Toolbox has been developed at the University of Antwerp and CWI, Amsterdam by @@ -70,8 +77,16 @@ def version(printToScreen=False):      else:          return getVersion() -def set_gpu_index(idx): +def set_gpu_index(idx, memory=0): +    import types +    import collections +    cdef SGPUParams params      if use_cuda()==True: -        ret = setGPUIndex(idx) +        if not isinstance(idx, collections.Iterable) or isinstance(idx, types.StringTypes): +            idx = (idx,) +        params.memory = memory +        params.GPUIndices = idx +        setGlobalGPUParams(params) +        ret = setGPUIndex(params.GPUIndices[0])          if not ret: -            six.print_("Failed to set GPU " + str(idx)) +            six.print_("Failed to set GPU " + str(params.GPUIndices[0])) diff --git a/samples/matlab/s020_3d_multiGPU.m b/samples/matlab/s020_3d_multiGPU.m new file mode 100644 index 0000000..bade325 --- /dev/null +++ b/samples/matlab/s020_3d_multiGPU.m @@ -0,0 +1,38 @@ +% ----------------------------------------------------------------------- +% This file is part of the ASTRA Toolbox +%  +% Copyright: 2010-2015, iMinds-Vision Lab, University of Antwerp +%            2014-2015, CWI, Amsterdam +% License: Open Source under GPLv3 +% Contact: astra@uantwerpen.be +% Website: http://sf.net/projects/astra-toolbox +% ----------------------------------------------------------------------- + + +% Set up multi-GPU usage. +% This only works for 3D GPU forward projection and back projection. +astra_mex('set_gpu_index', [0 1]); + +% Optionally, you can also restrict the amount of GPU memory ASTRA will use. +% The line commented below sets this to 1GB. +%astra_mex('set_gpu_index', [0 1], 'memory', 1024*1024*1024); + +vol_geom = astra_create_vol_geom(1024, 1024, 1024); + +angles = linspace2(0, pi, 1024); +proj_geom = astra_create_proj_geom('parallel3d', 1.0, 1.0, 1024, 1024, angles); + +% Create a simple hollow cube phantom +cube = zeros(1024,1024,1024); +cube(129:896,129:896,129:896) = 1; +cube(257:768,257:768,257:768) = 0; + +% Create projection data from this +[proj_id, proj_data] = astra_create_sino3d_cuda(cube, proj_geom, vol_geom); + +% Backproject projection data +[bproj_id, bproj_data] = astra_create_backprojection3d_cuda(proj_data, proj_geom, vol_geom); + +astra_mex_data3d('delete', proj_id); +astra_mex_data3d('delete', bproj_id); + diff --git a/samples/python/s018_experimental_multires.py b/samples/python/s019_experimental_multires.py index cf38e53..cf38e53 100644 --- a/samples/python/s018_experimental_multires.py +++ b/samples/python/s019_experimental_multires.py diff --git a/samples/python/s020_3d_multiGPU.py b/samples/python/s020_3d_multiGPU.py new file mode 100644 index 0000000..d6799c4 --- /dev/null +++ b/samples/python/s020_3d_multiGPU.py @@ -0,0 +1,57 @@ +#----------------------------------------------------------------------- +#Copyright 2013 Centrum Wiskunde & Informatica, Amsterdam +# +#Author: Daniel M. Pelt +#Contact: D.M.Pelt@cwi.nl +#Website: http://dmpelt.github.io/pyastratoolbox/ +# +# +#This file is part of the Python interface to the +#All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). +# +#The Python interface to the ASTRA Toolbox is free software: you can redistribute it and/or modify +#it under the terms of the GNU General Public License as published by +#the Free Software Foundation, either version 3 of the License, or +#(at your option) any later version. +# +#The Python interface to the ASTRA Toolbox is distributed in the hope that it will be useful, +#but WITHOUT ANY WARRANTY; without even the implied warranty of +#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +#GNU General Public License for more details. +# +#You should have received a copy of the GNU General Public License +#along with the Python interface to the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. +# +#----------------------------------------------------------------------- + +import astra +import numpy as np + +# Set up multi-GPU usage. +# This only works for 3D GPU forward projection and back projection. +astra.astra.set_gpu_index([0,1]) + +# Optionally, you can also restrict the amount of GPU memory ASTRA will use. +# The line commented below sets this to 1GB. +#astra.astra.set_gpu_index([0,1], memory=1024*1024*1024) + +vol_geom = astra.create_vol_geom(1024, 1024, 1024) + +angles = np.linspace(0, np.pi, 1024,False) +proj_geom = astra.create_proj_geom('parallel3d', 1.0, 1.0, 1024, 1024, angles) + +# Create a simple hollow cube phantom +cube = np.zeros((1024,1024,1024)) +cube[128:895,128:895,128:895] = 1 +cube[256:767,256:767,256:767] = 0 + +# Create projection data from this +proj_id, proj_data = astra.create_sino3d_gpu(cube, proj_geom, vol_geom) + +# Backproject projection data +bproj_id, bproj_data = astra.create_backprojection3d_gpu(proj_data, proj_geom, vol_geom) + +# Clean up. Note that GPU memory is tied up in the algorithm object, +# and main RAM in the data objects. +astra.data3d.delete(proj_id) +astra.data3d.delete(bproj_id) diff --git a/src/CompositeGeometryManager.cpp b/src/CompositeGeometryManager.cpp index 41f6319..c9cbaaa 100644 --- a/src/CompositeGeometryManager.cpp +++ b/src/CompositeGeometryManager.cpp @@ -44,11 +44,32 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.  #include "../cuda/3d/mem3d.h"  #include <cstring> +#include <sstream> +#include <stdint.h> + +#ifndef USE_PTHREADS +#include <boost/thread/mutex.hpp> +#include <boost/thread.hpp> +#endif +  namespace astra { +SGPUParams* CCompositeGeometryManager::s_params = 0; + +CCompositeGeometryManager::CCompositeGeometryManager() +{ +	m_iMaxSize = 0; + +	if (s_params) { +		m_iMaxSize = s_params->memory; +		m_GPUIndices = s_params->GPUIndices; +	} +} + +  // JOB: -//   +//  // VolumePart  // ProjectionPart  // FP-or-BP @@ -76,9 +97,11 @@ namespace astra {  //   (First approach: 0.5/0.5) -  bool CCompositeGeometryManager::splitJobs(TJobSet &jobs, size_t maxSize, int div, TJobSet &split)  { +	int maxBlockDim = astraCUDA3d::maxBlockDimension(); +	ASTRA_DEBUG("Found max block dim %d", maxBlockDim); +  	split.clear();  	for (TJobSet::const_iterator i = jobs.begin(); i != jobs.end(); ++i) @@ -92,7 +115,22 @@ bool CCompositeGeometryManager::splitJobs(TJobSet &jobs, size_t maxSize, int div  		//    b. split input part  		//    c. create jobs for new (input,output) subparts -		TPartList splitOutput = pOutput->split(maxSize/3, div); +		TPartList splitOutput; +		pOutput->splitZ(splitOutput, maxSize/3, SIZE_MAX, div); +#if 0 +		TPartList splitOutput2; +		for (TPartList::iterator i_out = splitOutput.begin(); i_out != splitOutput.end(); ++i_out) { +			boost::shared_ptr<CPart> outputPart = *i_out; +			outputPart.get()->splitX(splitOutput2, SIZE_MAX, SIZE_MAX, 1); +		} +		splitOutput.clear(); +		for (TPartList::iterator i_out = splitOutput2.begin(); i_out != splitOutput2.end(); ++i_out) { +			boost::shared_ptr<CPart> outputPart = *i_out; +					outputPart.get()->splitY(splitOutput, SIZE_MAX, SIZE_MAX, 1); +		} +		splitOutput2.clear(); +#endif +  		for (TJobList::const_iterator j = L.begin(); j != L.end(); ++j)  		{ @@ -120,8 +158,21 @@ bool CCompositeGeometryManager::splitJobs(TJobSet &jobs, size_t maxSize, int div  				size_t remainingSize = ( maxSize - outputPart->getSize() ) / 2; -				TPartList splitInput = input->split(remainingSize, 1); +				TPartList splitInput; +				input->splitZ(splitInput, remainingSize, maxBlockDim, 1);  				delete input; +				TPartList splitInput2; +				for (TPartList::iterator i_in = splitInput.begin(); i_in != splitInput.end(); ++i_in) { +					boost::shared_ptr<CPart> inputPart = *i_in; +					inputPart.get()->splitX(splitInput2, SIZE_MAX, maxBlockDim, 1); +				} +				splitInput.clear(); +				for (TPartList::iterator i_in = splitInput2.begin(); i_in != splitInput2.end(); ++i_in) { +					boost::shared_ptr<CPart> inputPart = *i_in; +					inputPart.get()->splitY(splitInput, SIZE_MAX, maxBlockDim, 1); +				} +				splitInput2.clear(); +  				ASTRA_DEBUG("Input split into %d parts", splitInput.size());  				for (TPartList::iterator i_in = splitInput.begin(); @@ -305,37 +356,41 @@ CCompositeGeometryManager::CPart* CCompositeGeometryManager::CVolumePart::reduce  static size_t ceildiv(size_t a, size_t b) { -    return (a + b - 1) / b; +	return (a + b - 1) / b;  } -static size_t computeVerticalSplit(size_t maxBlock, int div, size_t sliceCount) +static size_t computeLinearSplit(size_t maxBlock, int div, size_t sliceCount)  { -    size_t blockSize = maxBlock; -    size_t blockCount = ceildiv(sliceCount, blockSize); - -    // Increase number of blocks to be divisible by div -    size_t divCount = div * ceildiv(blockCount, div); - -    // If divCount is above sqrt(number of slices), then -    // we can't guarantee divisibility by div, but let's try anyway -    if (ceildiv(sliceCount, ceildiv(sliceCount, divCount)) % div == 0) { -        blockCount = divCount; -    } else { -        // If divisibility isn't achievable, we may want to optimize -        // differently. -        // TODO: Figure out how to model and optimize this. -    } +	size_t blockSize = maxBlock; +	size_t blockCount; +	if (sliceCount <= blockSize) +		blockCount = 1; +	else +		blockCount = ceildiv(sliceCount, blockSize); + +	// Increase number of blocks to be divisible by div +	size_t divCount = div * ceildiv(blockCount, div); + +	// If divCount is above sqrt(number of slices), then +	// we can't guarantee divisibility by div, but let's try anyway +	if (ceildiv(sliceCount, ceildiv(sliceCount, divCount)) % div == 0) { +		blockCount = divCount; +	} else { +		// If divisibility isn't achievable, we may want to optimize +		// differently. +		// TODO: Figure out how to model and optimize this. +	} -    // Final adjustment to make blocks more evenly sized -    // (This can't make the blocks larger) -    blockSize = ceildiv(sliceCount, blockCount);  +	// Final adjustment to make blocks more evenly sized +	// (This can't make the blocks larger) +	blockSize = ceildiv(sliceCount, blockCount); -    ASTRA_DEBUG("%ld %ld -> %ld * %ld", sliceCount, maxBlock, blockCount, blockSize); +	ASTRA_DEBUG("%ld %ld -> %ld * %ld", sliceCount, maxBlock, blockCount, blockSize); -    assert(blockSize <= maxBlock); -    assert((divCount * divCount > sliceCount) || (blockCount % div) == 0); +	assert(blockSize <= maxBlock); +	assert((divCount * divCount > sliceCount) || (blockCount % div) == 0); -    return blockSize; +	return blockSize;  }  template<class V, class P> @@ -391,7 +446,17 @@ SPar3DProjection* getProjectionVectors(const CParallelVecProjectionGeometry3D* p  template<class V> -static void translateProjectionVectors(V* pProjs, int count, double dv) +static void translateProjectionVectorsU(V* pProjs, int count, double du) +{ +	for (int i = 0; i < count; ++i) { +		pProjs[i].fDetSX += du * pProjs[i].fDetUX; +		pProjs[i].fDetSY += du * pProjs[i].fDetUY; +		pProjs[i].fDetSZ += du * pProjs[i].fDetUZ; +	} +} + +template<class V> +static void translateProjectionVectorsV(V* pProjs, int count, double dv)  {  	for (int i = 0; i < count; ++i) {  		pProjs[i].fDetSX += dv * pProjs[i].fDetVX; @@ -401,8 +466,58 @@ static void translateProjectionVectors(V* pProjs, int count, double dv)  } +static CProjectionGeometry3D* getSubProjectionGeometryU(const CProjectionGeometry3D* pProjGeom, int u, int size) +{ +	// First convert to vectors, then translate, then convert into new object + +	const CConeProjectionGeometry3D* conegeom = dynamic_cast<const CConeProjectionGeometry3D*>(pProjGeom); +	const CParallelProjectionGeometry3D* par3dgeom = dynamic_cast<const CParallelProjectionGeometry3D*>(pProjGeom); +	const CParallelVecProjectionGeometry3D* parvec3dgeom = dynamic_cast<const CParallelVecProjectionGeometry3D*>(pProjGeom); +	const CConeVecProjectionGeometry3D* conevec3dgeom = dynamic_cast<const CConeVecProjectionGeometry3D*>(pProjGeom); + +	if (conegeom || conevec3dgeom) { +		SConeProjection* pConeProjs; +		if (conegeom) { +			pConeProjs = getProjectionVectors<SConeProjection>(conegeom); +		} else { +			pConeProjs = getProjectionVectors<SConeProjection>(conevec3dgeom); +		} + +		translateProjectionVectorsU(pConeProjs, pProjGeom->getProjectionCount(), u); + +		CProjectionGeometry3D* ret = new CConeVecProjectionGeometry3D(pProjGeom->getProjectionCount(), +		                                                              pProjGeom->getDetectorRowCount(), +		                                                              size, +		                                                              pConeProjs); -static CProjectionGeometry3D* getSubProjectionGeometry(const CProjectionGeometry3D* pProjGeom, int v, int size) + +		delete[] pConeProjs; +		return ret; +	} else { +		assert(par3dgeom || parvec3dgeom); +		SPar3DProjection* pParProjs; +		if (par3dgeom) { +			pParProjs = getProjectionVectors<SPar3DProjection>(par3dgeom); +		} else { +			pParProjs = getProjectionVectors<SPar3DProjection>(parvec3dgeom); +		} + +		translateProjectionVectorsU(pParProjs, pProjGeom->getProjectionCount(), u); + +		CProjectionGeometry3D* ret = new CParallelVecProjectionGeometry3D(pProjGeom->getProjectionCount(), +		                                                                  pProjGeom->getDetectorRowCount(), +		                                                                  size, +		                                                                  pParProjs); + +		delete[] pParProjs; +		return ret; +	} + +} + + + +static CProjectionGeometry3D* getSubProjectionGeometryV(const CProjectionGeometry3D* pProjGeom, int v, int size)  {  	// First convert to vectors, then translate, then convert into new object @@ -419,7 +534,7 @@ static CProjectionGeometry3D* getSubProjectionGeometry(const CProjectionGeometry  			pConeProjs = getProjectionVectors<SConeProjection>(conevec3dgeom);  		} -		translateProjectionVectors(pConeProjs, pProjGeom->getProjectionCount(), v); +		translateProjectionVectorsV(pConeProjs, pProjGeom->getProjectionCount(), v);  		CProjectionGeometry3D* ret = new CConeVecProjectionGeometry3D(pProjGeom->getProjectionCount(),  		                                                              size, @@ -438,7 +553,7 @@ static CProjectionGeometry3D* getSubProjectionGeometry(const CProjectionGeometry  			pParProjs = getProjectionVectors<SPar3DProjection>(parvec3dgeom);  		} -		translateProjectionVectors(pParProjs, pProjGeom->getProjectionCount(), v); +		translateProjectionVectorsV(pParProjs, pProjGeom->getProjectionCount(), v);  		CProjectionGeometry3D* ret = new CParallelVecProjectionGeometry3D(pProjGeom->getProjectionCount(),  		                                                                  size, @@ -457,17 +572,110 @@ static CProjectionGeometry3D* getSubProjectionGeometry(const CProjectionGeometry  // - each no bigger than maxSize  // - number of sub-parts is divisible by div  // - maybe all approximately the same size? -CCompositeGeometryManager::TPartList CCompositeGeometryManager::CVolumePart::split(size_t maxSize, int div) +void CCompositeGeometryManager::CVolumePart::splitX(CCompositeGeometryManager::TPartList& out, size_t maxSize, size_t maxDim, int div)  { -	TPartList ret; +	if (true) { +		// Split in vertical direction only at first, until we figure out +		// a model for splitting in other directions + +		size_t sliceSize = ((size_t) pGeom->getGridSliceCount()) * pGeom->getGridRowCount(); +		int sliceCount = pGeom->getGridColCount(); +		size_t m = std::min(maxSize / sliceSize, maxDim); +		size_t blockSize = computeLinearSplit(m, div, sliceCount); + +		int rem = sliceCount % blockSize; + +		ASTRA_DEBUG("From %d to %d step %d", -(rem / 2), sliceCount, blockSize); + +		for (int x = -(rem / 2); x < sliceCount; x += blockSize) { +			int newsubX = x; +			if (newsubX < 0) newsubX = 0; +			int endX = x + blockSize; +			if (endX > sliceCount) endX = sliceCount; +			int size = endX - newsubX; + +			CVolumePart *sub = new CVolumePart(); +			sub->subX = this->subX + newsubX; +			sub->subY = this->subY; +			sub->subZ = this->subZ; + +			ASTRA_DEBUG("VolumePart split %d %d %d -> %p", sub->subX, sub->subY, sub->subZ, (void*)sub); + +			double shift = pGeom->getPixelLengthX() * newsubX; + +			sub->pData = pData; +			sub->pGeom = new CVolumeGeometry3D(size, +			                                   pGeom->getGridRowCount(), +			                                   pGeom->getGridSliceCount(), +			                                   pGeom->getWindowMinX() + shift, +			                                   pGeom->getWindowMinY(), +			                                   pGeom->getWindowMinZ(), +			                                   pGeom->getWindowMinX() + shift + size * pGeom->getPixelLengthX(), +			                                   pGeom->getWindowMaxY(), +			                                   pGeom->getWindowMaxZ()); + +			out.push_back(boost::shared_ptr<CPart>(sub)); +		} +	} +} +void CCompositeGeometryManager::CVolumePart::splitY(CCompositeGeometryManager::TPartList& out, size_t maxSize, size_t maxDim, int div) +{ +	if (true) { +		// Split in vertical direction only at first, until we figure out +		// a model for splitting in other directions + +		size_t sliceSize = ((size_t) pGeom->getGridColCount()) * pGeom->getGridSliceCount(); +		int sliceCount = pGeom->getGridRowCount(); +		size_t m = std::min(maxSize / sliceSize, maxDim); +		size_t blockSize = computeLinearSplit(m, div, sliceCount); + +		int rem = sliceCount % blockSize; + +		ASTRA_DEBUG("From %d to %d step %d", -(rem / 2), sliceCount, blockSize); + +		for (int y = -(rem / 2); y < sliceCount; y += blockSize) { +			int newsubY = y; +			if (newsubY < 0) newsubY = 0; +			int endY = y + blockSize; +			if (endY > sliceCount) endY = sliceCount; +			int size = endY - newsubY; + +			CVolumePart *sub = new CVolumePart(); +			sub->subX = this->subX; +			sub->subY = this->subY + newsubY; +			sub->subZ = this->subZ; + +			ASTRA_DEBUG("VolumePart split %d %d %d -> %p", sub->subX, sub->subY, sub->subZ, (void*)sub); + +			double shift = pGeom->getPixelLengthY() * newsubY; + +			sub->pData = pData; +			sub->pGeom = new CVolumeGeometry3D(pGeom->getGridColCount(), +			                                   size, +			                                   pGeom->getGridSliceCount(), +			                                   pGeom->getWindowMinX(), +			                                   pGeom->getWindowMinY() + shift, +			                                   pGeom->getWindowMinZ(), +			                                   pGeom->getWindowMaxX(), +			                                   pGeom->getWindowMinY() + shift + size * pGeom->getPixelLengthY(), +			                                   pGeom->getWindowMaxZ()); + +			out.push_back(boost::shared_ptr<CPart>(sub)); +		} +	} +} + +void CCompositeGeometryManager::CVolumePart::splitZ(CCompositeGeometryManager::TPartList& out, size_t maxSize, size_t maxDim, int div) +{  	if (true) {  		// Split in vertical direction only at first, until we figure out  		// a model for splitting in other directions  		size_t sliceSize = ((size_t) pGeom->getGridColCount()) * pGeom->getGridRowCount();  		int sliceCount = pGeom->getGridSliceCount(); -		size_t blockSize = computeVerticalSplit(maxSize / sliceSize, div, sliceCount); +		size_t m = std::min(maxSize / sliceSize, maxDim); +		size_t blockSize = computeLinearSplit(m, div, sliceCount);  		int rem = sliceCount % blockSize; @@ -500,11 +708,9 @@ CCompositeGeometryManager::TPartList CCompositeGeometryManager::CVolumePart::spl  			                                   pGeom->getWindowMaxY(),  			                                   pGeom->getWindowMinZ() + shift + size * pGeom->getPixelLengthZ()); -			ret.push_back(boost::shared_ptr<CPart>(sub)); +			out.push_back(boost::shared_ptr<CPart>(sub));  		}  	} - -	return ret;  }  CCompositeGeometryManager::CVolumePart* CCompositeGeometryManager::CVolumePart::clone() const @@ -611,7 +817,7 @@ CCompositeGeometryManager::CPart* CCompositeGeometryManager::CProjectionPart::re  	if (_vmin == _vmax) {  		sub->pGeom = 0;  	} else { -		sub->pGeom = getSubProjectionGeometry(pGeom, _vmin, _vmax - _vmin); +		sub->pGeom = getSubProjectionGeometryV(pGeom, _vmin, _vmax - _vmin);  	}  	ASTRA_DEBUG("Reduce projection from %d - %d to %d - %d", this->subZ, this->subZ + pGeom->getDetectorRowCount(), this->subZ + _vmin, this->subZ + _vmax); @@ -620,17 +826,58 @@ CCompositeGeometryManager::CPart* CCompositeGeometryManager::CProjectionPart::re  } -CCompositeGeometryManager::TPartList CCompositeGeometryManager::CProjectionPart::split(size_t maxSize, int div) +void CCompositeGeometryManager::CProjectionPart::splitX(CCompositeGeometryManager::TPartList &out, size_t maxSize, size_t maxDim, int div)  { -	TPartList ret; +	if (true) { +		// Split in vertical direction only at first, until we figure out +		// a model for splitting in other directions +		size_t sliceSize = ((size_t) pGeom->getDetectorRowCount()) * pGeom->getProjectionCount(); +		int sliceCount = pGeom->getDetectorColCount(); +		size_t m = std::min(maxSize / sliceSize, maxDim); +		size_t blockSize = computeLinearSplit(m, div, sliceCount); + +		int rem = sliceCount % blockSize; + +		for (int x = -(rem / 2); x < sliceCount; x += blockSize) { +			int newsubX = x; +			if (newsubX < 0) newsubX = 0; +			int endX = x + blockSize; +			if (endX > sliceCount) endX = sliceCount; +			int size = endX - newsubX; + +			CProjectionPart *sub = new CProjectionPart(); +			sub->subX = this->subX + newsubX; +			sub->subY = this->subY; +			sub->subZ = this->subZ; + +			ASTRA_DEBUG("ProjectionPart split %d %d %d -> %p", sub->subX, sub->subY, sub->subZ, (void*)sub); + +			sub->pData = pData; + +			sub->pGeom = getSubProjectionGeometryU(pGeom, newsubX, size); + +			out.push_back(boost::shared_ptr<CPart>(sub)); +		} +	} +} + +void CCompositeGeometryManager::CProjectionPart::splitY(CCompositeGeometryManager::TPartList &out, size_t maxSize, size_t maxDim, int div) +{ +	// TODO +	out.push_back(boost::shared_ptr<CPart>(clone())); +} + +void CCompositeGeometryManager::CProjectionPart::splitZ(CCompositeGeometryManager::TPartList &out, size_t maxSize, size_t maxDim, int div) +{  	if (true) {  		// Split in vertical direction only at first, until we figure out  		// a model for splitting in other directions  		size_t sliceSize = ((size_t) pGeom->getDetectorColCount()) * pGeom->getProjectionCount();  		int sliceCount = pGeom->getDetectorRowCount(); -		size_t blockSize = computeVerticalSplit(maxSize / sliceSize, div, sliceCount); +		size_t m = std::min(maxSize / sliceSize, maxDim); +		size_t blockSize = computeLinearSplit(m, div, sliceCount);  		int rem = sliceCount % blockSize; @@ -650,14 +897,12 @@ CCompositeGeometryManager::TPartList CCompositeGeometryManager::CProjectionPart:  			sub->pData = pData; -			sub->pGeom = getSubProjectionGeometry(pGeom, newsubZ, size); +			sub->pGeom = getSubProjectionGeometryV(pGeom, newsubZ, size); -			ret.push_back(boost::shared_ptr<CPart>(sub)); +			out.push_back(boost::shared_ptr<CPart>(sub));  		}  	} -	return ret; -  }  CCompositeGeometryManager::CProjectionPart* CCompositeGeometryManager::CProjectionPart::clone() const @@ -665,13 +910,12 @@ CCompositeGeometryManager::CProjectionPart* CCompositeGeometryManager::CProjecti  	return new CProjectionPart(*this);  } - -bool CCompositeGeometryManager::doFP(CProjector3D *pProjector, CFloat32VolumeData3DMemory *pVolData, -                                     CFloat32ProjectionData3DMemory *pProjData) +CCompositeGeometryManager::SJob CCompositeGeometryManager::createJobFP(CProjector3D *pProjector, +                                            CFloat32VolumeData3DMemory *pVolData, +                                            CFloat32ProjectionData3DMemory *pProjData)  { -	ASTRA_DEBUG("CCompositeGeometryManager::doFP"); +	ASTRA_DEBUG("CCompositeGeometryManager::createJobFP");  	// Create single job for FP -	// Run result  	CVolumePart *input = new CVolumePart();  	input->pData = pVolData; @@ -696,18 +940,15 @@ bool CCompositeGeometryManager::doFP(CProjector3D *pProjector, CFloat32VolumeDat  	FP.eType = SJob::JOB_FP;  	FP.eMode = SJob::MODE_SET; -	TJobList L; -	L.push_back(FP); - -	return doJobs(L); +	return FP;  } -bool CCompositeGeometryManager::doBP(CProjector3D *pProjector, CFloat32VolumeData3DMemory *pVolData, -                                     CFloat32ProjectionData3DMemory *pProjData) +CCompositeGeometryManager::SJob CCompositeGeometryManager::createJobBP(CProjector3D *pProjector, +                                            CFloat32VolumeData3DMemory *pVolData, +                                            CFloat32ProjectionData3DMemory *pProjData)  { -	ASTRA_DEBUG("CCompositeGeometryManager::doBP"); +	ASTRA_DEBUG("CCompositeGeometryManager::createJobBP");  	// Create single job for BP -	// Run result  	CProjectionPart *input = new CProjectionPart();  	input->pData = pProjData; @@ -730,8 +971,23 @@ bool CCompositeGeometryManager::doBP(CProjector3D *pProjector, CFloat32VolumeDat  	BP.eType = SJob::JOB_BP;  	BP.eMode = SJob::MODE_SET; +	return BP; +} + +bool CCompositeGeometryManager::doFP(CProjector3D *pProjector, CFloat32VolumeData3DMemory *pVolData, +                                     CFloat32ProjectionData3DMemory *pProjData) +{ +	TJobList L; +	L.push_back(createJobFP(pProjector, pVolData, pProjData)); + +	return doJobs(L); +} + +bool CCompositeGeometryManager::doBP(CProjector3D *pProjector, CFloat32VolumeData3DMemory *pVolData, +                                     CFloat32ProjectionData3DMemory *pProjData) +{  	TJobList L; -	L.push_back(BP); +	L.push_back(createJobBP(pProjector, pVolData, pProjData));  	return doJobs(L);  } @@ -848,6 +1104,260 @@ bool CCompositeGeometryManager::doBP(CProjector3D *pProjector, const std::vector +static bool doJob(const CCompositeGeometryManager::TJobSet::const_iterator& iter) +{ +	CCompositeGeometryManager::CPart* output = iter->first; +	const CCompositeGeometryManager::TJobList& L = iter->second; + +	assert(!L.empty()); + +	bool zero = L.begin()->eMode == CCompositeGeometryManager::SJob::MODE_SET; + +	size_t outx, outy, outz; +	output->getDims(outx, outy, outz); + +	if (L.begin()->eType == CCompositeGeometryManager::SJob::JOB_NOP) { +		// just zero output? +		if (zero) { +			for (size_t z = 0; z < outz; ++z) { +				for (size_t y = 0; y < outy; ++y) { +					float* ptr = output->pData->getData(); +					ptr += (z + output->subX) * (size_t)output->pData->getHeight() * (size_t)output->pData->getWidth(); +					ptr += (y + output->subY) * (size_t)output->pData->getWidth(); +					ptr += output->subX; +					memset(ptr, 0, sizeof(float) * outx); +				} +			} +		} +		return true; +	} + + +	astraCUDA3d::SSubDimensions3D dstdims; +	dstdims.nx = output->pData->getWidth(); +	dstdims.pitch = dstdims.nx; +	dstdims.ny = output->pData->getHeight(); +	dstdims.nz = output->pData->getDepth(); +	dstdims.subnx = outx; +	dstdims.subny = outy; +	dstdims.subnz = outz; +	ASTRA_DEBUG("dstdims: %d,%d,%d in %d,%d,%d", dstdims.subnx, dstdims.subny, dstdims.subnz, dstdims.nx, dstdims.ny, dstdims.nz); +	dstdims.subx = output->subX; +	dstdims.suby = output->subY; +	dstdims.subz = output->subZ; +	float *dst = output->pData->getData(); + +	astraCUDA3d::MemHandle3D outputMem = astraCUDA3d::allocateGPUMemory(outx, outy, outz, zero ? astraCUDA3d::INIT_ZERO : astraCUDA3d::INIT_NO); +	bool ok = outputMem; + +	for (CCompositeGeometryManager::TJobList::const_iterator i = L.begin(); i != L.end(); ++i) { +		const CCompositeGeometryManager::SJob &j = *i; + +		assert(j.pInput); + +		CCudaProjector3D *projector = dynamic_cast<CCudaProjector3D*>(j.pProjector); +		Cuda3DProjectionKernel projKernel = ker3d_default; +		int detectorSuperSampling = 1; +		int voxelSuperSampling = 1; +		if (projector) { +			projKernel = projector->getProjectionKernel(); +			detectorSuperSampling = projector->getDetectorSuperSampling(); +			voxelSuperSampling = projector->getVoxelSuperSampling(); +		} + +		size_t inx, iny, inz; +		j.pInput->getDims(inx, iny, inz); +		astraCUDA3d::MemHandle3D inputMem = astraCUDA3d::allocateGPUMemory(inx, iny, inz, astraCUDA3d::INIT_NO); + +		astraCUDA3d::SSubDimensions3D srcdims; +		srcdims.nx = j.pInput->pData->getWidth(); +		srcdims.pitch = srcdims.nx; +		srcdims.ny = j.pInput->pData->getHeight(); +		srcdims.nz = j.pInput->pData->getDepth(); +		srcdims.subnx = inx; +		srcdims.subny = iny; +		srcdims.subnz = inz; +		srcdims.subx = j.pInput->subX; +		srcdims.suby = j.pInput->subY; +		srcdims.subz = j.pInput->subZ; +		const float *src = j.pInput->pData->getDataConst(); + +		ok = astraCUDA3d::copyToGPUMemory(src, inputMem, srcdims); +		if (!ok) ASTRA_ERROR("Error copying input data to GPU"); + +		if (j.eType == CCompositeGeometryManager::SJob::JOB_FP) { +			assert(dynamic_cast<CCompositeGeometryManager::CVolumePart*>(j.pInput.get())); +			assert(dynamic_cast<CCompositeGeometryManager::CProjectionPart*>(j.pOutput.get())); + +			ASTRA_DEBUG("CCompositeGeometryManager::doJobs: doing FP"); + +			ok = astraCUDA3d::FP(((CCompositeGeometryManager::CProjectionPart*)j.pOutput.get())->pGeom, outputMem, ((CCompositeGeometryManager::CVolumePart*)j.pInput.get())->pGeom, inputMem, detectorSuperSampling, projKernel); +			if (!ok) ASTRA_ERROR("Error performing sub-FP"); +			ASTRA_DEBUG("CCompositeGeometryManager::doJobs: FP done"); +		} else if (j.eType == CCompositeGeometryManager::SJob::JOB_BP) { +			assert(dynamic_cast<CCompositeGeometryManager::CVolumePart*>(j.pOutput.get())); +			assert(dynamic_cast<CCompositeGeometryManager::CProjectionPart*>(j.pInput.get())); + +			ASTRA_DEBUG("CCompositeGeometryManager::doJobs: doing BP"); + +			ok = astraCUDA3d::BP(((CCompositeGeometryManager::CProjectionPart*)j.pInput.get())->pGeom, inputMem, ((CCompositeGeometryManager::CVolumePart*)j.pOutput.get())->pGeom, outputMem, voxelSuperSampling); +			if (!ok) ASTRA_ERROR("Error performing sub-BP"); +			ASTRA_DEBUG("CCompositeGeometryManager::doJobs: BP done"); +		} else { +			assert(false); +		} + +		ok = astraCUDA3d::freeGPUMemory(inputMem); +		if (!ok) ASTRA_ERROR("Error freeing GPU memory"); + +	} + +	ok = astraCUDA3d::copyFromGPUMemory(dst, outputMem, dstdims); +	if (!ok) ASTRA_ERROR("Error copying output data from GPU"); +	 +	ok = astraCUDA3d::freeGPUMemory(outputMem); +	if (!ok) ASTRA_ERROR("Error freeing GPU memory"); + +	return true; +} + + +class WorkQueue { +public: +	WorkQueue(CCompositeGeometryManager::TJobSet &_jobs) : m_jobs(_jobs) { +#ifdef USE_PTHREADS +		pthread_mutex_init(&m_mutex, 0); +#endif +		m_iter = m_jobs.begin(); +	} +	bool receive(CCompositeGeometryManager::TJobSet::const_iterator &i) { +		lock(); + +		if (m_iter == m_jobs.end()) { +			unlock(); +			return false; +		} + +		i = m_iter++; + +		unlock(); + +		return true;	 +	} +#ifdef USE_PTHREADS +	void lock() { +		// TODO: check mutex op return values +		pthread_mutex_lock(&m_mutex); +	} +	void unlock() { +		// TODO: check mutex op return values +		pthread_mutex_unlock(&m_mutex); +	} +#else +	void lock() { +		m_mutex.lock(); +	} +	void unlock() { +		m_mutex.unlock(); +	} +#endif + +private: +	CCompositeGeometryManager::TJobSet &m_jobs; +	CCompositeGeometryManager::TJobSet::const_iterator m_iter; +#ifdef USE_PTHREADS +	pthread_mutex_t m_mutex; +#else +	boost::mutex m_mutex; +#endif +}; + +struct WorkThreadInfo { +	WorkQueue* m_queue; +	unsigned int m_iGPU; +}; + +#ifndef USE_PTHREADS + +void runEntries_boost(WorkThreadInfo* info) +{ +	ASTRA_DEBUG("Launching thread on GPU %d\n", info->m_iGPU); +	CCompositeGeometryManager::TJobSet::const_iterator i; +	while (info->m_queue->receive(i)) { +		ASTRA_DEBUG("Running block on GPU %d\n", info->m_iGPU); +		astraCUDA3d::setGPUIndex(info->m_iGPU); +		boost::this_thread::interruption_point(); +		doJob(i); +		boost::this_thread::interruption_point(); +	} +	ASTRA_DEBUG("Finishing thread on GPU %d\n", info->m_iGPU); +} + + +#else + +void* runEntries_pthreads(void* data) { +	WorkThreadInfo* info = (WorkThreadInfo*)data; + +	ASTRA_DEBUG("Launching thread on GPU %d\n", info->m_iGPU); + +	CCompositeGeometryManager::TJobSet::const_iterator i; + +	while (info->m_queue->receive(i)) { +		ASTRA_DEBUG("Running block on GPU %d\n", info->m_iGPU); +		astraCUDA3d::setGPUIndex(info->m_iGPU); +		pthread_testcancel(); +		doJob(i); +		pthread_testcancel(); +	} +	ASTRA_DEBUG("Finishing thread on GPU %d\n", info->m_iGPU); + +	return 0; +} + +#endif + + +void runWorkQueue(WorkQueue &queue, const std::vector<int> & iGPUIndices) { +	int iThreadCount = iGPUIndices.size(); + +	std::vector<WorkThreadInfo> infos; +#ifdef USE_PTHREADS +	std::vector<pthread_t> threads; +#else +	std::vector<boost::thread*> threads; +#endif +	infos.resize(iThreadCount); +	threads.resize(iThreadCount); + +	for (int i = 0; i < iThreadCount; ++i) { +		infos[i].m_queue = &queue; +		infos[i].m_iGPU = iGPUIndices[i]; +#ifdef USE_PTHREADS +		pthread_create(&threads[i], 0, runEntries_pthreads, (void*)&infos[i]); +#else +		threads[i] = new boost::thread(runEntries_boost, &infos[i]); +#endif +	} + +	// Wait for them to finish +	for (int i = 0; i < iThreadCount; ++i) { +#ifdef USE_PTHREADS +		pthread_join(threads[i], 0); +#else +		threads[i]->join(); +		delete threads[i]; +		threads[i] = 0; +#endif +	} +} + + +void CCompositeGeometryManager::setGPUIndices(const std::vector<int>& GPUIndices) +{ +	m_GPUIndices = GPUIndices; +} +  bool CCompositeGeometryManager::doJobs(TJobList &jobs)  {  	ASTRA_DEBUG("CCompositeGeometryManager::doJobs"); @@ -859,140 +1369,53 @@ bool CCompositeGeometryManager::doJobs(TJobList &jobs)  		jobset[i->pOutput.get()].push_back(*i);  	} -	size_t maxSize = astraCUDA3d::availableGPUMemory(); +	size_t maxSize = m_iMaxSize;  	if (maxSize == 0) { -		ASTRA_WARN("Unable to get available GPU memory. Defaulting to 1GB."); -		maxSize = 1024 * 1024 * 1024; +		// Get memory from first GPU. Not optimal... +		if (!m_GPUIndices.empty()) +			astraCUDA3d::setGPUIndex(m_GPUIndices[0]); +		maxSize = astraCUDA3d::availableGPUMemory(); +		if (maxSize == 0) { +			ASTRA_WARN("Unable to get available GPU memory. Defaulting to 1GB."); +			maxSize = 1024 * 1024 * 1024; +		} else { +			ASTRA_DEBUG("Detected %lu bytes of GPU memory", maxSize); +		}  	} else { -		ASTRA_DEBUG("Detected %lu bytes of GPU memory", maxSize); +		ASTRA_DEBUG("Set to %lu bytes of GPU memory", maxSize);  	}  	maxSize = (maxSize * 9) / 10;  	maxSize /= sizeof(float);  	int div = 1; - -	// TODO: Multi-GPU support +	if (!m_GPUIndices.empty()) +		div = m_GPUIndices.size();  	// Split jobs to fit  	TJobSet split;  	splitJobs(jobset, maxSize, div, split);  	jobset.clear(); -	// Run jobs -	 -	for (TJobSet::iterator iter = split.begin(); iter != split.end(); ++iter) { - -		CPart* output = iter->first; -		TJobList& L = iter->second; +	if (m_GPUIndices.size() <= 1) { -		assert(!L.empty()); +		// Run jobs +		ASTRA_DEBUG("Running single-threaded"); -		bool zero = L.begin()->eMode == SJob::MODE_SET; +		if (!m_GPUIndices.empty()) +			astraCUDA3d::setGPUIndex(m_GPUIndices[0]); -		size_t outx, outy, outz; -		output->getDims(outx, outy, outz); - -		if (L.begin()->eType == SJob::JOB_NOP) { -			// just zero output? -			if (zero) { -				for (size_t z = 0; z < outz; ++z) { -					for (size_t y = 0; y < outy; ++y) { -						float* ptr = output->pData->getData(); -						ptr += (z + output->subX) * (size_t)output->pData->getHeight() * (size_t)output->pData->getWidth(); -						ptr += (y + output->subY) * (size_t)output->pData->getWidth(); -						ptr += output->subX; -						memset(ptr, 0, sizeof(float) * outx); -					} -				} -			} -			continue; +		for (TJobSet::const_iterator iter = split.begin(); iter != split.end(); ++iter) { +			doJob(iter);  		} +	} else { -		astraCUDA3d::SSubDimensions3D dstdims; -		dstdims.nx = output->pData->getWidth(); -		dstdims.pitch = dstdims.nx; -		dstdims.ny = output->pData->getHeight(); -		dstdims.nz = output->pData->getDepth(); -		dstdims.subnx = outx; -		dstdims.subny = outy; -		dstdims.subnz = outz; -		ASTRA_DEBUG("dstdims: %d,%d,%d in %d,%d,%d", dstdims.subnx, dstdims.subny, dstdims.subnz, dstdims.nx, dstdims.ny, dstdims.nz); -		dstdims.subx = output->subX; -		dstdims.suby = output->subY; -		dstdims.subz = output->subZ; -		float *dst = output->pData->getData(); - -		astraCUDA3d::MemHandle3D outputMem = astraCUDA3d::allocateGPUMemory(outx, outy, outz, zero ? astraCUDA3d::INIT_ZERO : astraCUDA3d::INIT_NO); -		bool ok = outputMem; - -		for (TJobList::iterator i = L.begin(); i != L.end(); ++i) { -			SJob &j = *i; - -			assert(j.pInput); - -			CCudaProjector3D *projector = dynamic_cast<CCudaProjector3D*>(j.pProjector); -			Cuda3DProjectionKernel projKernel = ker3d_default; -			int detectorSuperSampling = 1; -			int voxelSuperSampling = 1; -			if (projector) { -				projKernel = projector->getProjectionKernel(); -				detectorSuperSampling = projector->getDetectorSuperSampling(); -				voxelSuperSampling = projector->getVoxelSuperSampling(); -			} - -			size_t inx, iny, inz; -			j.pInput->getDims(inx, iny, inz); -			astraCUDA3d::MemHandle3D inputMem = astraCUDA3d::allocateGPUMemory(inx, iny, inz, astraCUDA3d::INIT_NO); - -			astraCUDA3d::SSubDimensions3D srcdims; -			srcdims.nx = j.pInput->pData->getWidth(); -			srcdims.pitch = srcdims.nx; -			srcdims.ny = j.pInput->pData->getHeight(); -			srcdims.nz = j.pInput->pData->getDepth(); -			srcdims.subnx = inx; -			srcdims.subny = iny; -			srcdims.subnz = inz; -			srcdims.subx = j.pInput->subX; -			srcdims.suby = j.pInput->subY; -			srcdims.subz = j.pInput->subZ; -			const float *src = j.pInput->pData->getDataConst(); - -			ok = astraCUDA3d::copyToGPUMemory(src, inputMem, srcdims); -			if (!ok) ASTRA_ERROR("Error copying input data to GPU"); - -			if (j.eType == SJob::JOB_FP) { -				assert(dynamic_cast<CVolumePart*>(j.pInput.get())); -				assert(dynamic_cast<CProjectionPart*>(j.pOutput.get())); - -				ASTRA_DEBUG("CCompositeGeometryManager::doJobs: doing FP"); - -				ok = astraCUDA3d::FP(((CProjectionPart*)j.pOutput.get())->pGeom, outputMem, ((CVolumePart*)j.pInput.get())->pGeom, inputMem, detectorSuperSampling, projKernel); -				if (!ok) ASTRA_ERROR("Error performing sub-FP"); -				ASTRA_DEBUG("CCompositeGeometryManager::doJobs: FP done"); -			} else if (j.eType == SJob::JOB_BP) { -				assert(dynamic_cast<CVolumePart*>(j.pOutput.get())); -				assert(dynamic_cast<CProjectionPart*>(j.pInput.get())); - -				ASTRA_DEBUG("CCompositeGeometryManager::doJobs: doing BP"); - -				ok = astraCUDA3d::BP(((CProjectionPart*)j.pInput.get())->pGeom, inputMem, ((CVolumePart*)j.pOutput.get())->pGeom, outputMem, voxelSuperSampling); -				if (!ok) ASTRA_ERROR("Error performing sub-BP"); -				ASTRA_DEBUG("CCompositeGeometryManager::doJobs: BP done"); -			} else { -				assert(false); -			} +		ASTRA_DEBUG("Running multi-threaded"); -			ok = astraCUDA3d::freeGPUMemory(inputMem); -			if (!ok) ASTRA_ERROR("Error freeing GPU memory"); +		WorkQueue wq(split); -		} +		runWorkQueue(wq, m_GPUIndices); -		ok = astraCUDA3d::copyFromGPUMemory(dst, outputMem, dstdims); -		if (!ok) ASTRA_ERROR("Error copying output data from GPU"); -		 -		ok = astraCUDA3d::freeGPUMemory(outputMem); -		if (!ok) ASTRA_ERROR("Error freeing GPU memory");  	}  	return true; @@ -1000,6 +1423,26 @@ bool CCompositeGeometryManager::doJobs(TJobList &jobs) + +//static +void CCompositeGeometryManager::setGlobalGPUParams(const SGPUParams& params) +{ +	delete s_params; + +	s_params = new SGPUParams; +	*s_params = params; + +	ASTRA_DEBUG("CompositeGeometryManager: Setting global GPU params:"); +	std::ostringstream s; +	s << "GPU indices:"; +	for (unsigned int i = 0; i < params.GPUIndices.size(); ++i) +		s << " " << params.GPUIndices[i]; +	std::string ss = s.str(); +	ASTRA_DEBUG(ss.c_str()); +	ASTRA_DEBUG("Memory: %llu", params.memory); +} + +  }  #endif diff --git a/src/ConeProjectionGeometry3D.cpp b/src/ConeProjectionGeometry3D.cpp index 99b4bf4..96b04fb 100644 --- a/src/ConeProjectionGeometry3D.cpp +++ b/src/ConeProjectionGeometry3D.cpp @@ -256,9 +256,6 @@ void CConeProjectionGeometry3D::projectPoint(double fX, double fY, double fZ,  	// Scale fS to detector plane  	fU = detectorOffsetXToColIndexFloat( (fS * (m_fOriginSourceDistance + m_fOriginDetectorDistance)) / fD ); - -	ASTRA_DEBUG("alpha: %f, D: %f, V: %f, S: %f, U: %f", alpha, fD, fV, fS, fU); -  }  void CConeProjectionGeometry3D::backprojectPointX(int iAngleIndex, double fU, double fV, | 
