/* ----------------------------------------------------------------------- Copyright: 2010-2015, iMinds-Vision Lab, University of Antwerp 2014-2015, CWI, Amsterdam Contact: astra@uantwerpen.be Website: http://sf.net/projects/astra-toolbox This file is part of the ASTRA Toolbox. The ASTRA Toolbox is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. The ASTRA Toolbox is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with the ASTRA Toolbox. If not, see . ----------------------------------------------------------------------- $Id$ */ #include #include #include #include #include "util.h" #include "arith.h" #ifdef STANDALONE #include "testutil.h" #endif #define PIXELTRACE typedef texture texture2D; static texture2D gT_volumeTexture; namespace astraCUDA { static const unsigned g_MaxAngles = 2560; __constant__ float gC_angle[g_MaxAngles]; __constant__ float gC_angle_offset[g_MaxAngles]; // optimization parameters static const unsigned int g_anglesPerBlock = 16; static const unsigned int g_detBlockSize = 32; static const unsigned int g_blockSlices = 64; // fixed point scaling factor #define fPREC_FACTOR 16.0f #define iPREC_FACTOR 16 // if necessary, a buffer of zeroes of size g_MaxAngles static float* g_pfZeroes = 0; static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); dataArray = 0; cudaMallocArray(&dataArray, &channelDesc, width, height); cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice); gT_volumeTexture.addressMode[0] = cudaAddressModeBorder; gT_volumeTexture.addressMode[1] = cudaAddressModeBorder; gT_volumeTexture.filterMode = cudaFilterModeLinear; gT_volumeTexture.normalized = false; // TODO: For very small sizes (roughly <=512x128) with few angles (<=180) // not using an array is more efficient. // cudaBindTexture2D(0, gT_volumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch); cudaBindTextureToArray(gT_volumeTexture, dataArray, channelDesc); // TODO: error value? return true; } // projection for angles that are roughly horizontal // theta between 45 and 135 degrees __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale) { const int relDet = threadIdx.x; const int relAngle = threadIdx.y; int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; if (angle >= endAngle) return; const float theta = gC_angle[angle]; const float cos_theta = __cosf(theta); const float sin_theta = __sinf(theta); // compute start detector for this block/angle: // (The same for all threadIdx.x) // ------------------------------------- const int midSlice = startSlice + g_blockSlices / 2; // ASSUMPTION: fDetScale >= 1.0f // problem: detector regions get skipped because slice blocks aren't large // enough const unsigned int g_blockSliceSize = g_detBlockSize; // project (midSlice,midRegion) on this thread's detector const float fBase = 0.5f*dims.iProjDets - 0.5f + ( (midSlice - 0.5f*dims.iVolWidth + 0.5f) * cos_theta - (g_blockSliceSize/2 - 0.5f*dims.iVolHeight + 0.5f) * sin_theta + gC_angle_offset[angle] ) / dims.fDetScale; int iBase = (int)floorf(fBase * fPREC_FACTOR); int iInc = (int)floorf(g_blockSliceSize * sin_theta / dims.fDetScale * -fPREC_FACTOR); // ASSUMPTION: 16 > regionOffset / fDetScale const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; const int detPrevRegion = (iBase + (blockIdx.y - regionOffset - 1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; if (blockIdx.y > 0 && detRegion == detPrevRegion) return; const int detector = detRegion * g_detBlockSize + relDet; // Now project the part of the ray to angle,detector through // slices startSlice to startSlice+g_blockSlices-1 if (detector < 0 || detector >= dims.iProjDets) return; const float fDetStep = -dims.fDetScale / sin_theta; float fSliceStep = cos_theta / sin_theta; float fDistCorr; if (sin_theta > 0.0f) fDistCorr = -fDetStep; else fDistCorr = fDetStep; fDistCorr *= outputScale; float fVal = 0.0f; // project detector on slice float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f; float fS = startSlice + 0.5f; int endSlice = startSlice + g_blockSlices; if (endSlice > dims.iVolWidth) endSlice = dims.iVolWidth; if (dims.iRaysPerDet > 1) { fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; const float fSubDetStep = fDetStep / dims.iRaysPerDet; fDistCorr /= dims.iRaysPerDet; fSliceStep -= dims.iRaysPerDet * fSubDetStep; for (int slice = startSlice; slice < endSlice; ++slice) { for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { fVal += tex2D(gT_volumeTexture, fS, fP); fP += fSubDetStep; } fP += fSliceStep; fS += 1.0f; } } else { for (int slice = startSlice; slice < endSlice; ++slice) { fVal += tex2D(gT_volumeTexture, fS, fP); fP += fSliceStep; fS += 1.0f; } } D_projData[angle*projPitch+detector] += fVal * fDistCorr; } // projection for angles that are roughly vertical // theta between 0 and 45, or 135 and 180 degrees __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale) { const int relDet = threadIdx.x; const int relAngle = threadIdx.y; int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; if (angle >= endAngle) return; const float theta = gC_angle[angle]; const float cos_theta = __cosf(theta); const float sin_theta = __sinf(theta); // compute start detector for this block/angle: // (The same for all threadIdx.x) // ------------------------------------- const int midSlice = startSlice + g_blockSlices / 2; // project (midSlice,midRegion) on this thread's detector // ASSUMPTION: fDetScale >= 1.0f // problem: detector regions get skipped because slice blocks aren't large // enough const unsigned int g_blockSliceSize = g_detBlockSize; const float fBase = 0.5f*dims.iProjDets - 0.5f + ( (g_blockSliceSize/2 - 0.5f*dims.iVolWidth + 0.5f) * cos_theta - (midSlice - 0.5f*dims.iVolHeight + 0.5f) * sin_theta + gC_angle_offset[angle] ) / dims.fDetScale; int iBase = (int)floorf(fBase * fPREC_FACTOR); int iInc = (int)floorf(g_blockSliceSize * cos_theta / dims.fDetScale * fPREC_FACTOR); // ASSUMPTION: 16 > regionOffset / fDetScale const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; const int detPrevRegion = (iBase + (blockIdx.y - regionOffset-1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; if (blockIdx.y > 0 && detRegion == detPrevRegion) return; const int detector = detRegion * g_detBlockSize + relDet; // Now project the part of the ray to angle,detector through // slices startSlice to startSlice+g_blockSlices-1 if (detector < 0 || detector >= dims.iProjDets) return; const float fDetStep = dims.fDetScale / cos_theta; float fSliceStep = sin_theta / cos_theta; float fDistCorr; if (cos_theta < 0.0f) fDistCorr = -fDetStep; else fDistCorr = fDetStep; fDistCorr *= outputScale; float fVal = 0.0f; float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f; float fS = startSlice+0.5f; int endSlice = startSlice + g_blockSlices; if (endSlice > dims.iVolHeight) endSlice = dims.iVolHeight; if (dims.iRaysPerDet > 1) { fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; const float fSubDetStep = fDetStep / dims.iRaysPerDet; fDistCorr /= dims.iRaysPerDet; fSliceStep -= dims.iRaysPerDet * fSubDetStep; for (int slice = startSlice; slice < endSlice; ++slice) { for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { fVal += tex2D(gT_volumeTexture, fP, fS); fP += fSubDetStep; } fP += fSliceStep; fS += 1.0f; } } else { for (int slice = startSlice; slice < endSlice; ++slice) { fVal += tex2D(gT_volumeTexture, fP, fS); fP += fSliceStep; fS += 1.0f; } } D_projData[angle*projPitch+detector] += fVal * fDistCorr; } // projection for angles that are roughly horizontal // (detector roughly vertical) __global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) { const int relDet = threadIdx.x; const int relAngle = threadIdx.y; int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; if (angle >= endAngle) return; const float theta = gC_angle[angle]; const float cos_theta = __cosf(theta); const float sin_theta = __sinf(theta); // compute start detector for this block/angle: const int detRegion = blockIdx.y; const int detector = detRegion * g_detBlockSize + relDet; // Now project the part of the ray to angle,detector through // slices startSlice to startSlice+g_blockSlices-1 if (detector < 0 || detector >= dims.iProjDets) return; const float fDetStep = -dims.fDetScale / sin_theta; float fSliceStep = cos_theta / sin_theta; float fDistCorr; if (sin_theta > 0.0f) fDistCorr = -fDetStep; else fDistCorr = fDetStep; fDistCorr *= outputScale; float fVal = 0.0f; // project detector on slice float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f; float fS = startSlice + 0.5f; int endSlice = startSlice + g_blockSlices; if (endSlice > dims.iVolWidth) endSlice = dims.iVolWidth; if (dims.iRaysPerDet > 1) { fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; const float fSubDetStep = fDetStep / dims.iRaysPerDet; fDistCorr /= dims.iRaysPerDet; fSliceStep -= dims.iRaysPerDet * fSubDetStep; for (int slice = startSlice; slice < endSlice; ++slice) { for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { fVal += tex2D(gT_volumeTexture, fS, fP); fP += fSubDetStep; } fP += fSliceStep; fS += 1.0f; } } else { for (int slice = startSlice; slice < endSlice; ++slice) { fVal += tex2D(gT_volumeTexture, fS, fP); fP += fSliceStep; fS += 1.0f; } } D_projData[angle*projPitch+detector] += fVal * fDistCorr; } // projection for angles that are roughly vertical // (detector roughly horizontal) __global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) { const int relDet = threadIdx.x; const int relAngle = threadIdx.y; int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; if (angle >= endAngle) return; const float theta = gC_angle[angle]; const float cos_theta = __cosf(theta); const float sin_theta = __sinf(theta); // compute start detector for this block/angle: const int detRegion = blockIdx.y; const int detector = detRegion * g_detBlockSize + relDet; // Now project the part of the ray to angle,detector through // slices startSlice to startSlice+g_blockSlices-1 if (detector < 0 || detector >= dims.iProjDets) return; const float fDetStep = dims.fDetScale / cos_theta; float fSliceStep = sin_theta / cos_theta; float fDistCorr; if (cos_theta < 0.0f) fDistCorr = -fDetStep; else fDistCorr = fDetStep; fDistCorr *= outputScale; float fVal = 0.0f; float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f; float fS = startSlice+0.5f; int endSlice = startSlice + g_blockSlices; if (endSlice > dims.iVolHeight) endSlice = dims.iVolHeight; if (dims.iRaysPerDet > 1) { fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; const float fSubDetStep = fDetStep / dims.iRaysPerDet; fDistCorr /= dims.iRaysPerDet; fSliceStep -= dims.iRaysPerDet * fSubDetStep; for (int slice = startSlice; slice < endSlice; ++slice) { for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { fVal += tex2D(gT_volumeTexture, fP, fS); fP += fSubDetStep; } fP += fSliceStep; fS += 1.0f; } } else { for (int slice = startSlice; slice < endSlice; ++slice) { fVal += tex2D(gT_volumeTexture, fP, fS); fP += fSliceStep; fS += 1.0f; } } D_projData[angle*projPitch+detector] += fVal * fDistCorr; } bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, const SDimensions& dims, const float* angles, const float* TOffsets, float outputScale) { // TODO: load angles into constant memory in smaller blocks assert(dims.iProjAngles <= g_MaxAngles); cudaArray* D_dataArray; bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight); cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); if (TOffsets) { cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } else { if (!g_pfZeroes) { g_pfZeroes = new float[g_MaxAngles]; memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float)); } cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // detector block size, angles std::list streams; // Run over all angles, grouping them into groups of the same // orientation (roughly horizontal vs. roughly vertical). // Start a stream of grids for each such group. // TODO: Check if it's worth it to store this info instead // of recomputing it every FP. unsigned int blockStart = 0; unsigned int blockEnd = 0; bool blockVertical = false; for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { bool vertical = false; // TODO: Having <= instead of < below causes a 5% speedup. // Maybe we should detect corner cases and put them in the optimal // group of angles. if (a != dims.iProjAngles) vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a]))); if (a == dims.iProjAngles || vertical != blockVertical) { // block done blockEnd = a; if (blockStart != blockEnd) { dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks // TODO: check if we can't immediately // destroy the stream after use cudaStream_t stream; cudaStreamCreate(&stream); streams.push_back(stream); //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical); if (!blockVertical) for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices) FPhorizontal_simple<<>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); else for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) FPvertical_simple<<>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); } blockVertical = vertical; blockStart = a; } } for (std::list::iterator iter = streams.begin(); iter != streams.end(); ++iter) cudaStreamDestroy(*iter); streams.clear(); cudaThreadSynchronize(); cudaTextForceKernelsCompletion(); cudaFreeArray(D_dataArray); return true; } bool FP_simple(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, const SDimensions& dims, const float* angles, const float* TOffsets, float outputScale) { for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) { SDimensions subdims = dims; unsigned int iEndAngle = iAngle + g_MaxAngles; if (iEndAngle >= dims.iProjAngles) iEndAngle = dims.iProjAngles; subdims.iProjAngles = iEndAngle - iAngle; bool ret; ret = FP_simple_internal(D_volumeData, volumePitch, D_projData + iAngle * projPitch, projPitch, subdims, angles + iAngle, TOffsets ? TOffsets + iAngle : 0, outputScale); if (!ret) return false; } return true; } bool FP(float* D_volumeData, unsigned int volumePitch, float* D_projData, unsigned int projPitch, const SDimensions& dims, const float* angles, const float* TOffsets, float outputScale) { return FP_simple(D_volumeData, volumePitch, D_projData, projPitch, dims, angles, TOffsets, outputScale); // TODO: Fix bug in this non-simple FP with large detscale and TOffsets #if 0 // TODO: load angles into constant memory in smaller blocks assert(dims.iProjAngles <= g_MaxAngles); // TODO: compute region size dynamically to resolve these two assumptions // ASSUMPTION: 16 > regionOffset / fDetScale const unsigned int g_blockSliceSize = g_detBlockSize; assert(16 > (g_blockSlices / g_blockSliceSize) / dims.fDetScale); // ASSUMPTION: fDetScale >= 1.0f assert(dims.fDetScale > 0.9999f); cudaArray* D_dataArray; bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight); cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); if (TOffsets) { cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } else { if (!g_pfZeroes) { g_pfZeroes = new float[g_MaxAngles]; memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float)); } cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } int regionOffset = g_blockSlices / g_blockSliceSize; dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles std::list streams; // Run over all angles, grouping them into groups of the same // orientation (roughly horizontal vs. roughly vertical). // Start a stream of grids for each such group. // TODO: Check if it's worth it to store this info instead // of recomputing it every FP. unsigned int blockStart = 0; unsigned int blockEnd = 0; bool blockVertical = false; for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { bool vertical; // TODO: Having <= instead of < below causes a 5% speedup. // Maybe we should detect corner cases and put them in the optimal // group of angles. if (a != dims.iProjAngles) vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a]))); if (a == dims.iProjAngles || vertical != blockVertical) { // block done blockEnd = a; if (blockStart != blockEnd) { unsigned int length = dims.iVolHeight; if (blockVertical) length = dims.iVolWidth; dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, (length+g_blockSliceSize-1)/g_blockSliceSize+2*regionOffset); // angle blocks, regions // TODO: check if we can't immediately // destroy the stream after use cudaStream_t stream; cudaStreamCreate(&stream); streams.push_back(stream); //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical); if (!blockVertical) for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices) FPhorizontal<<>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale); else for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) FPvertical<<>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale); } blockVertical = vertical; blockStart = a; } } for (std::list::iterator iter = streams.begin(); iter != streams.end(); ++iter) cudaStreamDestroy(*iter); streams.clear(); cudaThreadSynchronize(); cudaTextForceKernelsCompletion(); cudaFreeArray(D_dataArray); return true; #endif } } #ifdef STANDALONE using namespace astraCUDA; int main() { float* D_volumeData; float* D_projData; SDimensions dims; dims.iVolWidth = 1024; dims.iVolHeight = 1024; dims.iProjAngles = 512; dims.iProjDets = 1536; dims.fDetScale = 1.0f; dims.iRaysPerDet = 1; unsigned int volumePitch, projPitch; allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); printf("pitch: %u\n", volumePitch); allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); printf("pitch: %u\n", projPitch); unsigned int y, x; float* img = loadImage("phantom.png", y, x); float* sino = new float[dims.iProjAngles * dims.iProjDets]; memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float)); copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); float* angle = new float[dims.iProjAngles]; for (unsigned int i = 0; i < dims.iProjAngles; ++i) angle[i] = i*(M_PI/dims.iProjAngles); FP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0, 1.0f); delete[] angle; copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); float s = 0.0f; for (unsigned int y = 0; y < dims.iProjAngles; ++y) for (unsigned int x = 0; x < dims.iProjDets; ++x) s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x]; printf("cpu norm: %f\n", s); //zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); printf("gpu norm: %f\n", s); saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino); return 0; } #endif