// This file is part of VkFFT, a Vulkan Fast Fourier Transform library
//
// Copyright (C) 2020 - present Dmitrii Tolmachev <dtolm96@gmail.com>
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef VKFFT_H
#define VKFFT_H

#include <locale.h>
#include <memory.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#if(VKFFT_BACKEND==0)
#include "vulkan/vulkan.h"
#include "glslang_c_interface.h"
#elif(VKFFT_BACKEND==1)
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cuComplex.h>
#elif(VKFFT_BACKEND==2)
#include <hip/hiprtc.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <hip/hip_complex.h>
#elif(VKFFT_BACKEND==3)
#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
#endif
#ifdef VkFFT_use_FP128_Bluestein_RaderFFT
#include "fftw3.h"
#endif
// VkFFTConfiguration - all parameters that describe an FFT plan at creation time.
// Members guarded by VKFFT_BACKEND exist only for the selected backend; judging by the headers
// included at the top of this file, 0 is Vulkan, 1 is CUDA, 2 is HIP, 3 is OpenCL and 4 is Level Zero (ze_api.h).
// Because the member set (and therefore the struct layout) depends on VKFFT_BACKEND, every translation
// unit that uses this type has to be compiled with the same VKFFT_BACKEND value.
// Handles are passed as pointers to user-owned objects. Flags are stored as uint64_t (0 - off, 1 - on).
typedef struct {
	//WHDCN layout

	//required parameters:
	uint64_t FFTdim; //FFT dimensionality (1, 2 or 3)
	uint64_t size[3]; // WHD - system dimensions

#if(VKFFT_BACKEND==0)
	VkPhysicalDevice* physicalDevice;//pointer to Vulkan physical device, obtained from vkEnumeratePhysicalDevices
	VkDevice* device;//pointer to Vulkan device, created with vkCreateDevice
	VkQueue* queue;//pointer to Vulkan queue, created with vkGetDeviceQueue
	VkCommandPool* commandPool;//pointer to Vulkan command pool, created with vkCreateCommandPool
	VkFence* fence;//pointer to Vulkan fence, created with vkCreateFence
	uint64_t isCompilerInitialized;//specify if glslang compiler has been initialized before (0 - off, 1 - on). Default 0
#elif(VKFFT_BACKEND==1)
	CUdevice* device;//pointer to CUDA device, obtained from cuDeviceGet
	//CUcontext* context;//pointer to CUDA context, obtained from cuDeviceGet
	cudaStream_t* stream;//pointer to streams (can be more than 1), where to execute the kernels
	uint64_t num_streams;//try to submit CUDA kernels in multiple streams for asynchronous execution. Default 1
#elif(VKFFT_BACKEND==2)
	hipDevice_t* device;//pointer to HIP device, obtained from hipDeviceGet
	//hipCtx_t* context;//pointer to HIP context, obtained from hipDeviceGet
	hipStream_t* stream;//pointer to streams (can be more than 1), where to execute the kernels
	uint64_t num_streams;//try to submit HIP kernels in multiple streams for asynchronous execution. Default 1
#elif(VKFFT_BACKEND==3)
	cl_platform_id* platform;//pointer to OpenCL platform. Not required
	cl_device_id* device;//pointer to OpenCL device
	cl_context* context;//pointer to OpenCL context
#elif(VKFFT_BACKEND==4)
	ze_device_handle_t* device;//pointer to Level Zero device handle
	ze_context_handle_t* context;//pointer to Level Zero context handle
	ze_command_queue_handle_t* commandQueue;//pointer to Level Zero command queue handle
	uint32_t commandQueueID;//NOTE(review): presumably the ID/ordinal of the command queue (group) used by VkFFT - confirm against the implementation
#endif

	//data parameters:
	uint64_t userTempBuffer; //buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation (0 - off, 1 - on)

	uint64_t bufferNum;//multiple buffer sequence storage is Vulkan only. Default 1
	uint64_t tempBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation
	uint64_t inputBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, if isInputFormatted is enabled
	uint64_t outputBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, if isOutputFormatted is enabled
	uint64_t kernelNum;//multiple buffer sequence storage is Vulkan only. Default 1, if performConvolution is enabled

	//sizes are obligatory in Vulkan backend, optional in others
	uint64_t* bufferSize;//array of buffers sizes in bytes
	uint64_t* tempBufferSize;//array of temp buffers sizes in bytes. Default set to bufferSize sum, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation
	uint64_t* inputBufferSize;//array of input buffers sizes in bytes, if isInputFormatted is enabled
	uint64_t* outputBufferSize;//array of output buffers sizes in bytes, if isOutputFormatted is enabled
	uint64_t* kernelSize;//array of kernel buffers sizes in bytes, if performConvolution is enabled

#if(VKFFT_BACKEND==0)
	VkBuffer* buffer;//pointer to array of buffers (or one buffer) used for computations
	VkBuffer* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same sum size or bigger as buffer (can be split in multiple). Default 0. Setting to non zero value enables manual user allocation
	VkBuffer* inputBuffer;//pointer to array of input buffers (or one buffer) used to read data from if isInputFormatted is enabled
	VkBuffer* outputBuffer;//pointer to array of output buffers (or one buffer) used to write data to if isOutputFormatted is enabled
	VkBuffer* kernel;//pointer to array of kernel buffers (or one buffer) used to read kernel data from if performConvolution is enabled
#elif(VKFFT_BACKEND==1)
	void** buffer;//pointer to device buffer used for computations
	void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation
	void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled
	void** outputBuffer;//pointer to device buffer used to write data to if isOutputFormatted is enabled
	void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled
#elif(VKFFT_BACKEND==2)
	void** buffer;//pointer to device buffer used for computations
	void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation
	void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled
	void** outputBuffer;//pointer to device buffer used to write data to if isOutputFormatted is enabled
	void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled
#elif(VKFFT_BACKEND==3)
	cl_mem* buffer;//pointer to device buffer used for computations
	cl_mem* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation
	cl_mem* inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled
	cl_mem* outputBuffer;//pointer to device buffer used to write data to if isOutputFormatted is enabled
	cl_mem* kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled
#elif(VKFFT_BACKEND==4)
	void** buffer;//pointer to device buffer used for computations
	void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation
	void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled
	void** outputBuffer;//pointer to device buffer used to write data to if isOutputFormatted is enabled
	void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled
#endif
	uint64_t bufferOffset;//specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0
	uint64_t tempBufferOffset;//specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0
	uint64_t inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0
	uint64_t outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0
	uint64_t kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0
	uint64_t specifyOffsetsAtLaunch;//specify if offsets will be selected with launch parameters VkFFTLaunchParams (0 - off, 1 - on). Default 0

	//optional: (default 0 if not stated otherwise)
#if(VKFFT_BACKEND==0)
	VkPipelineCache* pipelineCache;//pointer to Vulkan pipeline cache
#endif
	uint64_t coalescedMemory;//in bytes, for Nvidia and AMD is equal to 32, Intel is equal to 64, scaled for half precision. Works regardless, but if specified correctly by the user, the performance will be higher.
	uint64_t aimThreads;//aim at this many threads per block. Default 128
	uint64_t numSharedBanks;//how many banks shared memory has. Default 32
	uint64_t inverseReturnToInputBuffer;//return data to the input buffer in inverse transform (0 - off, 1 - on). isInputFormatted must be enabled
	uint64_t numberBatches;// N - used to perform multiple batches of initial data. Default 1
	uint64_t useUint64;// use 64-bit addressing mode in generated kernels
	uint64_t omitDimension[3];//disable FFT for this dimension (0 - FFT enabled, 1 - FFT disabled). Default 0. Doesn't work for R2C dimension 0 for now. Doesn't work with convolutions.
	uint64_t performBandwidthBoost;//try to reduce coalesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT, 2 for Bluestein's algorithm (or -1 if DCT), 0 otherwise. NOTE(review): the field is unsigned, so -1 is stored as UINT64_MAX - confirm how the implementation tests for it

	uint64_t doublePrecision; //perform calculations in double precision (0 - off, 1 - on).
	uint64_t halfPrecision; //perform calculations in half precision (0 - off, 1 - on)
	uint64_t halfPrecisionMemoryOnly; //use half precision only as input/output buffer. Input/Output have to be allocated as half, buffer/tempBuffer have to be allocated as float (out of place mode only). Specify isInputFormatted and isOutputFormatted to use (0 - off, 1 - on)
	uint64_t doublePrecisionFloatMemory; //use FP64 precision for all calculations, while all memory storage is done in FP32.

	uint64_t performR2C; //perform R2C/C2R decomposition (0 - off, 1 - on)
	uint64_t performDCT; //perform DCT transformation (X - DCT type, 1-4)
	uint64_t disableMergeSequencesR2C; //disable merging of two real sequences to reduce calculations (0 - off, 1 - on)
	uint64_t normalize; //normalize inverse transform (0 - off, 1 - on)
	uint64_t disableReorderFourStep; // disables unshuffling of Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on)
	uint64_t useLUT; //switches from calculating sincos to using precomputed LUT tables (0 - off, 1 - on). Configured by initialization routine
	uint64_t makeForwardPlanOnly; //generate code only for forward FFT (0 - off, 1 - on)
	uint64_t makeInversePlanOnly; //generate code only for inverse FFT (0 - off, 1 - on)

	uint64_t bufferStride[3];//buffer strides - default set to x - x*y - x*y*z values
	uint64_t isInputFormatted; //specify if input buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1)
	uint64_t isOutputFormatted; //specify if output buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1)
	uint64_t inputBufferStride[3];//input buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values
	uint64_t outputBufferStride[3];//output buffer strides. Used if isOutputFormatted is enabled (NOTE(review): this comment previously said isInputFormatted, presumably a copy-paste slip - confirm). Default set to bufferStride values

	uint64_t considerAllAxesStrided;//will create plan for nonstrided axis similar as a strided axis - used with disableReorderFourStep to get the same layout for Bluestein kernel (0 - off, 1 - on)
	uint64_t keepShaderCode;//will keep shader code and print all executed shaders during the plan execution in order (0 - off, 1 - on)
	uint64_t printMemoryLayout;//will print order of buffers used in shaders (0 - off, 1 - on)

	uint64_t saveApplicationToString;//will save all compiled binaries to VkFFTApplication.saveApplicationString (will be allocated by VkFFT, deallocated with deleteVkFFT call). (0 - off, 1 - on)

	uint64_t loadApplicationFromString;//will load all binaries from loadApplicationString instead of recompiling them (must be allocated by user, must contain what saveApplicationToString call generated previously in VkFFTApplication.saveApplicationString). (0 - off, 1 - on). Mutually exclusive with saveApplicationToString
	void* loadApplicationString;//memory binary array through which user can load VkFFT binaries, must be provided by user if loadApplicationFromString = 1. Use rb/wb flags to load/save.

	uint64_t disableSetLocale;//disables all VkFFT attempts to set locale to C - user must ensure that VkFFT has C locale during the plan initialization. This option is needed for multithreading. Default 0.

	//optional Bluestein optimizations: (default 0 if not stated otherwise)
	uint64_t fixMaxRadixBluestein;//controls the padding of sequences in Bluestein convolution. If specified, padded sequence will be made of up to fixMaxRadixBluestein primes. Default: 2 for CUDA and Vulkan/OpenCL/HIP up to 1048576 combined dimension FFT system, 7 for Vulkan/OpenCL/HIP beyond that. Min = 2, Max = 13.
	uint64_t forceBluesteinSequenceSize;// force the sequence size to pad to in Bluestein's algorithm. Must be at least 2*N-1 and decomposable with primes 2-13.
	uint64_t useCustomBluesteinPaddingPattern;// force the sequence sizes to pad to in Bluestein's algorithm, but on a range. This number specifies the number of elements in primeSizes and in paddedSizes arrays. primeSizes - array of non-decomposable as radix scheme sizes - 17, 23, 31 etc.
											  // paddedSizes - array of lengths to pad to. paddedSizes[i] will be the padding size for all non-decomposable sequences from primeSizes[i] to primeSizes[i+1] (will use default scheme after last one) - 42, 60, 64 for primeSizes before and 37+ will use default scheme (for example). Default is vendor and API-based specified in autoCustomBluesteinPaddingPattern.
	uint64_t* primeSizes; // described in useCustomBluesteinPaddingPattern
	uint64_t* paddedSizes; // described in useCustomBluesteinPaddingPattern

	uint64_t fixMinRaderPrimeMult;//start direct multiplication Rader's algorithm for radix primes from this number. This means that VkFFT will inline custom Rader kernels if sequence is divisible by these primes. Default is 17, as VkFFT has kernels for 2-13. If you make it less than 13, VkFFT will switch from these kernels to Rader.
	uint64_t fixMaxRaderPrimeMult;//switch from Mult Rader's algorithm for radix primes from this number. Current limitation for Rader is maxThreadNum/2+1, realistically you would want to switch somewhere on 30-100 range. Default is vendor-specific (currently ~40)

	uint64_t fixMinRaderPrimeFFT;//start FFT convolution version of Rader for radix primes from this number. Better than direct multiplication version for almost all primes (except small ones, like 17-23 on some GPUs). Must be bigger or equal to fixMinRaderPrimeMult. Default 29 on AMD and 17 on other GPUs.
	uint64_t fixMaxRaderPrimeFFT;//switch to Bluestein's algorithm for radix primes from this number. Switch may happen earlier if prime can't fit in shared memory. Default is 16384, which is bigger than most current GPU's shared memory.

	//optional zero padding control parameters: (default 0 if not stated otherwise)
	uint64_t performZeropadding[3]; // don't read some data/perform computations if some input sequences are zeropadded for each axis (0 - off, 1 - on)
	uint64_t fft_zeropad_left[3];//specify start boundary of zero block in the system for each axis
	uint64_t fft_zeropad_right[3];//specify end boundary of zero block in the system for each axis
	uint64_t frequencyZeroPadding; //set to 1 if zeropadding of frequency domain, default 0 - spatial zeropadding

	//optional convolution control parameters: (default 0 if not stated otherwise)
	uint64_t performConvolution; //perform convolution in this application (0 - off, 1 - on). Disables reorderFourStep parameter
	uint64_t conjugateConvolution;//0 off, 1 - conjugation of the sequence FFT is currently done on, 2 - conjugation of the convolution kernel
	uint64_t crossPowerSpectrumNormalization;//normalize the FFT x kernel multiplication in frequency domain
	uint64_t coordinateFeatures; // C - coordinate, or dimension of features vector. In matrix convolution - size of vector
	uint64_t matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures
	uint64_t symmetricKernel; //specify if kernel in 2x2 or 3x3 matrix convolution is symmetric
	uint64_t numberKernels;// N - only used in convolution step - specify how many kernels were initialized before. Expands one input to multiple (batched) output
	uint64_t kernelConvolution;// specify if this application is used to create kernel for convolution, so it has the same properties. performConvolution has to be set to 0 for kernel creation

	//register overutilization (experimental): (default 0 if not stated otherwise)
	uint64_t registerBoost; //specify if register file size is bigger than shared memory and can be used to extend it X times (on Nvidia 256KB register file can be used instead of 32KB of shared memory, set this constant to 4 to emulate 128KB of shared memory). Default 1
	uint64_t registerBoostNonPow2; //specify if register overutilization should be used on non power of 2 sequences (0 - off, 1 - on)
	uint64_t registerBoost4Step; //specify if register file overutilization should be used in big sequences (>2^14), same definition as registerBoost. Default 1

	//not used techniques:
	uint64_t swapTo3Stage4Step; //specify at which power of 2 to switch from 2 upload to 3 upload 4-step FFT, in case if making max sequence size lower than coalesced sequence helps to combat TLB misses. Default 0 - disabled. Must be at least 17
	uint64_t devicePageSize;//in KB, the size of a page on the GPU. Setting to 0 disables local buffer split in pages
	uint64_t localPageSize;//in KB, the size to split page into if sequence spans multiple devicePageSize pages

	//automatically filled based on device info (still can be reconfigured by user):
	uint64_t computeCapabilityMajor; // CUDA/HIP compute capability of the device
	uint64_t computeCapabilityMinor; // CUDA/HIP compute capability of the device
	uint64_t maxComputeWorkGroupCount[3]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits
	uint64_t maxComputeWorkGroupSize[3]; // maxComputeWorkGroupSize from VkPhysicalDeviceLimits
	uint64_t maxThreadsNum; //max number of threads from VkPhysicalDeviceLimits
	uint64_t sharedMemorySizeStatic; //available for static allocation shared memory size, in bytes
	uint64_t sharedMemorySize; //available for allocation shared memory size, in bytes
	uint64_t sharedMemorySizePow2; //power of 2 which is less or equal to sharedMemorySize, in bytes
	uint64_t warpSize; //number of threads per warp/wavefront.
	uint64_t halfThreads;//Intel fix
	uint64_t allocateTempBuffer; //buffer allocated by app automatically if needed to reorder Four step algorithm. Parameter to check if it has been allocated
	uint64_t reorderFourStep; // unshuffle Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on). Default 1.
	int64_t maxCodeLength; //specify how big the buffer used for code generation can be (in char). Default 4000000 chars.
	int64_t maxTempLength; //specify how big the buffer used for intermediate string sprintfs can be (in char). Default 5000 chars. If code segfaults for some reason - try increasing this number.
	uint64_t autoCustomBluesteinPaddingPattern; // default value for useCustomBluesteinPaddingPattern
	uint64_t useRaderUintLUT; // allocate additional LUT to store g_pow
	uint64_t vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 - AMD, etc.
#if(VKFFT_BACKEND==0)
	VkDeviceMemory tempBufferDeviceMemory;//Filled at app creation
	VkCommandBuffer* commandBuffer;//Filled at app execution
	VkMemoryBarrier* memory_barrier;//Filled at app creation
#elif(VKFFT_BACKEND==1)
	cudaEvent_t* stream_event;//Filled at app creation
	uint64_t streamCounter;//Filled at app creation
	uint64_t streamID;//Filled at app creation
#elif(VKFFT_BACKEND==2)
	hipEvent_t* stream_event;//Filled at app creation
	uint64_t streamCounter;//Filled at app creation
	uint64_t streamID;//Filled at app creation
#elif(VKFFT_BACKEND==3)
	cl_command_queue* commandQueue;//NOTE(review): presumably filled at app execution like the Vulkan/Level Zero counterparts - confirm
#elif(VKFFT_BACKEND==4)
	ze_command_list_handle_t* commandList;//Filled at app execution
#endif
} VkFFTConfiguration;//parameters specified at plan creation

// VkFFTLaunchParams - parameters supplied when a plan is executed: the command buffer/queue/list
// the FFT is appended to (present in Vulkan, OpenCL and Level Zero builds only), the buffer handles
// and the optional byte offsets into those buffers.
// As with VkFFTConfiguration, the member set depends on VKFFT_BACKEND, so all users of this type
// must be compiled with the same VKFFT_BACKEND value.
typedef struct {
#if(VKFFT_BACKEND==0)
	VkCommandBuffer* commandBuffer;//commandBuffer to which FFT is appended

	VkBuffer* buffer;//pointer to array of buffers (or one buffer) used for computations
	VkBuffer* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same sum size or bigger as buffer (can be split in multiple). Default 0. Setting to non zero value enables manual user allocation
	VkBuffer* inputBuffer;//pointer to array of input buffers (or one buffer) used to read data from if isInputFormatted is enabled
	VkBuffer* outputBuffer;//pointer to array of output buffers (or one buffer) used to write data to if isOutputFormatted is enabled
	VkBuffer* kernel;//pointer to array of kernel buffers (or one buffer) used to read kernel data from if performConvolution is enabled
#elif(VKFFT_BACKEND==1)
	void** buffer;//pointer to device buffer used for computations
	void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation
	void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled
	void** outputBuffer;//pointer to device buffer used to write data to if isOutputFormatted is enabled
	void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled
#elif(VKFFT_BACKEND==2)
	void** buffer;//pointer to device buffer used for computations
	void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation
	void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled
	void** outputBuffer;//pointer to device buffer used to write data to if isOutputFormatted is enabled
	void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled
#elif(VKFFT_BACKEND==3)
	cl_command_queue* commandQueue;//command queue to which FFT is appended

	cl_mem* buffer;//pointer to device buffer used for computations
	cl_mem* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation
	cl_mem* inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled
	cl_mem* outputBuffer;//pointer to device buffer used to write data to if isOutputFormatted is enabled
	cl_mem* kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled
#elif(VKFFT_BACKEND==4)
	ze_command_list_handle_t* commandList;//commandList to which FFT is appended

	void** buffer;//pointer to device buffer used for computations
	void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation
	void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled
	void** outputBuffer;//pointer to device buffer used to write data to if isOutputFormatted is enabled
	void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled
#endif
	//following parameters can be specified during kernels launch, if specifyOffsetsAtLaunch parameter was enabled during the initializeVkFFT call
	uint64_t bufferOffset;//specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0
	uint64_t tempBufferOffset;//specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0
	uint64_t inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0
	uint64_t outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0
	uint64_t kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0
} VkFFTLaunchParams;//parameters specified at plan execution
// VkFFTResult - status codes used by VkFFT. VKFFT_SUCCESS is 0; every other enumerator is an error.
// Values are assigned explicitly and grouped into numeric ranges by category (see the per-group
// comments below). getVkFFTErrorString() maps a code to its enumerator name.
typedef enum VkFFTResult {
	VKFFT_SUCCESS = 0,
	// 1-5: general errors (host allocation, code/temp string buffer capacity, plan state)
	VKFFT_ERROR_MALLOC_FAILED = 1,
	VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER = 2,
	VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER = 3,
	VKFFT_ERROR_PLAN_NOT_INITIALIZED = 4,
	VKFFT_ERROR_NULL_TEMP_PASSED = 5,
	// 1001-1011: invalid handles or invalid use of an application
	VKFFT_ERROR_INVALID_PHYSICAL_DEVICE = 1001,
	VKFFT_ERROR_INVALID_DEVICE = 1002,
	VKFFT_ERROR_INVALID_QUEUE = 1003,
	VKFFT_ERROR_INVALID_COMMAND_POOL = 1004,
	VKFFT_ERROR_INVALID_FENCE = 1005,
	VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED = 1006,
	VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED = 1007,
	VKFFT_ERROR_INVALID_CONTEXT = 1008,
	VKFFT_ERROR_INVALID_PLATFORM = 1009,
	VKFFT_ERROR_ENABLED_saveApplicationToString = 1010,
	VKFFT_ERROR_EMPTY_FILE = 1011,
	// 2001-2014: a configuration field was left empty; the suffix matches the VkFFTConfiguration member name
	VKFFT_ERROR_EMPTY_FFTdim = 2001,
	VKFFT_ERROR_EMPTY_size = 2002,
	VKFFT_ERROR_EMPTY_bufferSize = 2003,
	VKFFT_ERROR_EMPTY_buffer = 2004,
	VKFFT_ERROR_EMPTY_tempBufferSize = 2005,
	VKFFT_ERROR_EMPTY_tempBuffer = 2006,
	VKFFT_ERROR_EMPTY_inputBufferSize = 2007,
	VKFFT_ERROR_EMPTY_inputBuffer = 2008,
	VKFFT_ERROR_EMPTY_outputBufferSize = 2009,
	VKFFT_ERROR_EMPTY_outputBuffer = 2010,
	VKFFT_ERROR_EMPTY_kernelSize = 2011,
	VKFFT_ERROR_EMPTY_kernel = 2012,
	VKFFT_ERROR_EMPTY_applicationString = 2013,
	VKFFT_ERROR_EMPTY_useCustomBluesteinPaddingPattern_arrays = 2014,
	// 3001-3005: unsupported transform parameters (radix, length, omitted dimension)
	VKFFT_ERROR_UNSUPPORTED_RADIX = 3001,
	VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH = 3002,
	VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003,
	VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT = 3004,
	VKFFT_ERROR_UNSUPPORTED_FFT_OMIT = 3005,
	// 4001-4055: a backend/runtime operation failed (allocation, compilation, submission, synchronization, ...)
	VKFFT_ERROR_FAILED_TO_ALLOCATE = 4001,
	VKFFT_ERROR_FAILED_TO_MAP_MEMORY = 4002,
	VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS = 4003,
	VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER = 4004,
	VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER = 4005,
	VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE = 4006,
	VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES = 4007,
	VKFFT_ERROR_FAILED_TO_RESET_FENCES = 4008,
	VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL = 4009,
	VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT = 4010,
	VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS = 4011,
	VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT = 4012,
	VKFFT_ERROR_FAILED_SHADER_PREPROCESS = 4013,
	VKFFT_ERROR_FAILED_SHADER_PARSE = 4014,
	VKFFT_ERROR_FAILED_SHADER_LINK = 4015,
	VKFFT_ERROR_FAILED_SPIRV_GENERATE = 4016,
	VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE = 4017,
	VKFFT_ERROR_FAILED_TO_CREATE_INSTANCE = 4018,
	VKFFT_ERROR_FAILED_TO_SETUP_DEBUG_MESSENGER = 4019,
	VKFFT_ERROR_FAILED_TO_FIND_PHYSICAL_DEVICE = 4020,
	VKFFT_ERROR_FAILED_TO_CREATE_DEVICE = 4021,
	VKFFT_ERROR_FAILED_TO_CREATE_FENCE = 4022,
	VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_POOL = 4023,
	VKFFT_ERROR_FAILED_TO_CREATE_BUFFER = 4024,
	VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY = 4025,
	VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY = 4026,
	VKFFT_ERROR_FAILED_TO_FIND_MEMORY = 4027,
	VKFFT_ERROR_FAILED_TO_SYNCHRONIZE = 4028,
	VKFFT_ERROR_FAILED_TO_COPY = 4029,
	VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM = 4030,
	VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM = 4031,
	VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE = 4032,
	VKFFT_ERROR_FAILED_TO_GET_CODE = 4033,
	VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM = 4034,
	VKFFT_ERROR_FAILED_TO_LOAD_MODULE = 4035,
	VKFFT_ERROR_FAILED_TO_GET_FUNCTION = 4036,
	VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY = 4037,
	VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL = 4038,
	VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL = 4039,
	VKFFT_ERROR_FAILED_TO_EVENT_RECORD = 4040,
	VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION = 4041,
	VKFFT_ERROR_FAILED_TO_INITIALIZE = 4042,
	VKFFT_ERROR_FAILED_TO_SET_DEVICE_ID = 4043,
	VKFFT_ERROR_FAILED_TO_GET_DEVICE = 4044,
	VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT = 4045,
	VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE = 4046,
	VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG = 4047,
	VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE = 4048,
	VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE = 4049,
	VKFFT_ERROR_FAILED_TO_ENUMERATE_DEVICES = 4050,
	VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE = 4051,
	VKFFT_ERROR_FAILED_TO_CREATE_EVENT = 4052,
	VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST = 4053,
	VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST = 4054,
	VKFFT_ERROR_FAILED_TO_SUBMIT_BARRIER = 4055
} VkFFTResult;

// Local helper: expands to a case label that returns the enumerator's own name as a string literal.
#define VKFFT_RESULT_NAME_CASE(code) case code: return #code
// Maps a VkFFTResult value to the textual name of its enumerator.
// Values not listed below yield the fallback string "Unknown VkFFT error".
static inline const char* getVkFFTErrorString(VkFFTResult result)
{
	// There is intentionally no default label; unmatched values leave the switch
	// and reach the fallback return at the bottom.
	switch (result)
	{
	VKFFT_RESULT_NAME_CASE(VKFFT_SUCCESS);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_MALLOC_FAILED);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_PLAN_NOT_INITIALIZED);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_NULL_TEMP_PASSED);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_INVALID_PHYSICAL_DEVICE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_INVALID_DEVICE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_INVALID_QUEUE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_INVALID_COMMAND_POOL);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_INVALID_FENCE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_INVALID_CONTEXT);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_INVALID_PLATFORM);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_ENABLED_saveApplicationToString);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_FILE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_FFTdim);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_size);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_bufferSize);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_buffer);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_tempBufferSize);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_tempBuffer);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_inputBufferSize);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_inputBuffer);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_outputBufferSize);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_outputBuffer);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_kernelSize);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_kernel);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_applicationString);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_EMPTY_useCustomBluesteinPaddingPattern_arrays);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_UNSUPPORTED_RADIX);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_UNSUPPORTED_FFT_OMIT);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_ALLOCATE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_MAP_MEMORY);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_RESET_FENCES);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_SHADER_PREPROCESS);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_SHADER_PARSE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_SHADER_LINK);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_SPIRV_GENERATE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_INSTANCE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_SETUP_DEBUG_MESSENGER);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_FIND_PHYSICAL_DEVICE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_DEVICE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_FENCE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_POOL);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_BUFFER);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_FIND_MEMORY);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_SYNCHRONIZE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_COPY);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_GET_CODE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_LOAD_MODULE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_GET_FUNCTION);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_EVENT_RECORD);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_INITIALIZE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_SET_DEVICE_ID);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_GET_DEVICE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_ENUMERATE_DEVICES);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_EVENT);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST);
	VKFFT_RESULT_NAME_CASE(VKFFT_ERROR_FAILED_TO_SUBMIT_BARRIER);
	}
	return "Unknown VkFFT error";
}
#undef VKFFT_RESULT_NAME_CASE


// Forward typedef so the struct can be referred to by its short name, including from
// the self-referential `container` pointer below.
typedef struct VkFFTRaderContainer VkFFTRaderContainer;

// Parameter set describing one Rader prime (see `prime`, `generator`) and how it is
// processed; `type` selects between the FFT and direct-multiplication variants.
struct VkFFTRaderContainer {
	uint64_t prime;
	uint64_t generator;
	uint64_t multiplier;
	uint64_t inline_rader_g_pow;
	uint64_t raderUintLUToffset;

	uint64_t type; //0 - FFT, 1 - Direct multiplication

	uint64_t raderRegisters;
	uint64_t rader_min_registers;

	//Direct multiplication parameters

	//FFT parameters
	uint64_t registers_per_thread;
	uint64_t min_registers_per_thread;
	uint64_t loc_multipliers[33];
	uint64_t registers_per_thread_per_radix[33];
	uint64_t stageRadix[20];
	uint64_t numStages;
	uint64_t numSubPrimes;
	uint64_t stage_rader_generator[20];
	uint64_t containerFFTDim;
	uint64_t containerFFTNum;
	uint64_t subLogicalGroupSizeMax;//how many threads are needed per Rader transform
	uint64_t RaderKernelOffsetLUT;
	uint64_t RaderRadixOffsetLUT;
	uint64_t RaderRadixOffsetLUTiFFT;
	void* raderFFTkernel;

	// Pointer to further container(s) of the same type; presumably nested sub-prime
	// containers (cf. numSubPrimes) - TODO confirm against the code that fills it.
	struct VkFFTRaderContainer* container;
};

// Aggregate of all parameters and scratch state used while generating kernel source.
// The emitter helpers in this file format text into `tempStr` and append it to `output`
// through VkAppendLine, which maintains `currentLen` and checks `maxCodeLength`.
typedef struct {
	uint64_t size[3];
	uint64_t localSize[3];
	uint64_t numSubgroups;
	uint64_t sourceFFTSize;
	uint64_t fftDim;
	uint64_t inverse;
	uint64_t actualInverse;
	uint64_t inverseBluestein;
	uint64_t zeropad[2];
	uint64_t zeropadBluestein[2];
	uint64_t axis_id;
	uint64_t axis_upload_id;
	uint64_t numAxisUploads;
	uint64_t registers_per_thread;
	uint64_t registers_per_thread_per_radix[33];
	uint64_t min_registers_per_thread;
	uint64_t maxNonPow2Radix;
	uint64_t usedLocRegs;
	uint64_t readToRegisters;
	uint64_t writeFromRegisters;
	uint64_t LUT;
	uint64_t raderUintLUT;
	uint64_t useCoalescedLUTUploadToSM;
	uint64_t useBluesteinFFT;
	uint64_t reverseBluesteinMultiUpload;
	uint64_t BluesteinConvolutionStep;
	uint64_t BluesteinPreMultiplication;
	uint64_t BluesteinPostMultiplication;
	uint64_t startDCT3LUT;
	uint64_t startDCT4LUT;
	uint64_t performR2C;
	uint64_t performR2CmultiUpload;
	uint64_t performDCT;
	uint64_t performBandwidthBoost;
	uint64_t frequencyZeropadding;
	uint64_t performZeropaddingFull[3]; // don't do read/write if full sequence is omitted
	uint64_t performZeropaddingInput[3]; // don't read if input is zeropadded (0 - off, 1 - on)
	uint64_t performZeropaddingOutput[3]; // don't write if output is zeropadded (0 - off, 1 - on)
	uint64_t fft_zeropad_left_full[3];
	uint64_t fft_zeropad_left_read[3];
	uint64_t fft_zeropad_left_write[3];
	uint64_t fft_zeropad_right_full[3];
	uint64_t fft_zeropad_right_read[3];
	uint64_t fft_zeropad_right_write[3];
	uint64_t fft_zeropad_Bluestein_left_read[3];
	uint64_t fft_zeropad_Bluestein_left_write[3];
	uint64_t fft_zeropad_Bluestein_right_read[3];
	uint64_t fft_zeropad_Bluestein_right_write[3];
	uint64_t inputStride[5];
	uint64_t outputStride[5];
	uint64_t fft_dim_full;
	uint64_t stageStartSize;
	uint64_t firstStageStartSize;
	uint64_t fft_dim_x;
	uint64_t dispatchZactualFFTSize;
	uint64_t numStages;
	uint64_t stageRadix[33];
	uint64_t inputOffset;
	uint64_t kernelOffset;
	uint64_t outputOffset;
	uint64_t reorderFourStep;
	uint64_t pushConstantsStructSize;
	uint64_t performWorkGroupShift[3];
	uint64_t performPostCompilationInputOffset;
	uint64_t performPostCompilationOutputOffset;
	uint64_t performPostCompilationKernelOffset;
	uint64_t inputBufferBlockNum;
	uint64_t inputBufferBlockSize;
	uint64_t outputBufferBlockNum;
	uint64_t outputBufferBlockSize;
	uint64_t kernelBlockNum;
	uint64_t kernelBlockSize;
	uint64_t numCoordinates;
	uint64_t matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures
	uint64_t numBatches;
	uint64_t numKernels;
	uint64_t conjugateConvolution;
	uint64_t crossPowerSpectrumNormalization;
	uint64_t usedSharedMemory;
	uint64_t sharedMemSize;
	uint64_t sharedMemSizePow2;
	uint64_t normalize;
	uint64_t complexSize;
	uint64_t inputNumberByteSize;
	uint64_t outputNumberByteSize;
	uint64_t kernelNumberByteSize;
	uint64_t maxStageSumLUT;
	uint64_t unroll;
	uint64_t swapComputeWorkGroupID;
	uint64_t convolutionStep;
	uint64_t symmetricKernel;
	uint64_t supportAxis;
	uint64_t cacheShuffle;
	uint64_t registerBoost;
	uint64_t warpSize;
	uint64_t numSharedBanks;
	uint64_t resolveBankConflictFirstStages;
	uint64_t sharedStrideBankConflictFirstStages;
	uint64_t sharedStrideReadWriteConflict;

	uint64_t sharedStrideRaderFFT;
	uint64_t sharedShiftRaderFFT;

	uint64_t maxSharedStride;
	uint64_t axisSwapped;
	uint64_t mergeSequencesR2C;

	uint64_t numBuffersBound[10];
	uint64_t convolutionBindingID;
	uint64_t LUTBindingID;
	uint64_t BluesteinConvolutionBindingID;
	uint64_t BluesteinMultiplicationBindingID;

	// Rader-related settings; the containers hold the per-prime parameter sets.
	uint64_t useRader;
	uint64_t numRaderPrimes;
	uint64_t minRaderFFTThreadNum;
	VkFFTRaderContainer* raderContainer;
	VkFFTRaderContainer* currentRaderContainer;
	uint64_t RaderUintLUTBindingID;

	uint64_t useRaderMult;
	uint64_t additionalRaderSharedSize;
	uint64_t RaderKernelOffsetShared[33];
	uint64_t RaderKernelOffsetLUT[33];
	uint64_t rader_generator[33];
	uint64_t fixMinRaderPrimeMult;//start Rader algorithm for primes from this number
	uint64_t fixMaxRaderPrimeMult;//switch from Rader to Bluestein algorithm for primes from this number
	uint64_t fixMinRaderPrimeFFT;//start Rader algorithm for primes from this number
	uint64_t fixMaxRaderPrimeFFT;//switch from Rader to Bluestein algorithm for primes from this number

	uint64_t inline_rader_g_pow;
	uint64_t inline_rader_kernel;

	uint64_t raderRegisters;
	uint64_t rader_min_registers;

	uint64_t useRaderFFT;

	uint64_t performOffsetUpdate;
	uint64_t performBufferSetUpdate;
	uint64_t useUint64;
	uint64_t disableSetLocale;

	// Fixed-size character buffers below presumably hold identifier/expression text
	// substituted into the generated code - TODO confirm against the code that fills them.
	char** regIDs;
	char* disableThreadsStart;
	char* disableThreadsEnd;
	char sdataID[50];
	char inoutID[50];
	char combinedID[50];
	char raderIDx[50];
	char raderIDx2[50];
	char gl_LocalInvocationID_x[50];
	char gl_LocalInvocationID_y[50];
	char gl_LocalInvocationID_z[50];
	char gl_GlobalInvocationID_x[200];
	char gl_GlobalInvocationID_y[200];
	char gl_GlobalInvocationID_z[200];
	char gl_SubgroupInvocationID[200];
	char gl_SubgroupID[200];
	char tshuffle[50];
	char sharedStride[50];
	char gl_WorkGroupSize_x[50];
	char gl_WorkGroupSize_y[50];
	char gl_WorkGroupSize_z[50];
	char gl_WorkGroupID_x[50];
	char gl_WorkGroupID_y[50];
	char gl_WorkGroupID_z[50];
	char tempReg[50];
	char stageInvocationID[50];
	char blockInvocationID[50];
	char temp[50];
	char w[50];
	char iw[50];
	char x0[33][40];
	char locID[33][40];
	char* code0;
	char* output; // destination buffer that VkAppendLine / VkAppendLineFromInput write into
	char* tempStr; // scratch string the emitter helpers sprintf into before appending
	int64_t tempLen; // sprintf return value for tempStr; negative is rejected by VkAppendLine
	int64_t currentLen; // number of characters already written to output
	int64_t maxCodeLength; // limit that currentLen plus the appended length is checked against
	int64_t maxTempLength;
	char oldLocale[100];
} VkFFTSpecializationConstantsLayout;
// Push-constant description: raw uint32/uint64 data slots plus, for each optional
// item (work-group shift, input/output/kernel offset), an enable flag paired with its value.
typedef struct {
	uint32_t dataUint32[10];
	uint64_t dataUint64[10];
	//specify what can be in layout
	uint64_t performWorkGroupShift[3];
	uint64_t workGroupShift[3];

	uint64_t performPostCompilationInputOffset;
	uint64_t inputOffset;

	uint64_t performPostCompilationOutputOffset;
	uint64_t outputOffset;

	uint64_t performPostCompilationKernelOffset;
	uint64_t kernelOffset;

	uint64_t structSize;
} VkFFTPushConstantsLayout;

// State for a single axis kernel: its specialization/push-constant layouts plus the
// backend-specific buffer and kernel handles. The handle section is selected at compile
// time by VKFFT_BACKEND (0: Vk* types, 1: CU* types, 2: hip* types, 3: cl_* types,
// 4: ze_* types).
typedef struct {
	uint64_t numBindings;
	uint64_t axisBlock[4];
	uint64_t groupedBatch;
	VkFFTSpecializationConstantsLayout specializationConstants;
	VkFFTPushConstantsLayout pushConstants;
	uint64_t updatePushConstants;
#if(VKFFT_BACKEND==0)
	VkBuffer* inputBuffer;
	VkBuffer* outputBuffer;
	VkDescriptorPool descriptorPool;
	VkDescriptorSetLayout descriptorSetLayout;
	VkDescriptorSet descriptorSet;
	VkPipelineLayout pipelineLayout;
	VkPipeline pipeline;
	VkDeviceMemory bufferLUTDeviceMemory;
	VkBuffer bufferLUT;
	VkDeviceMemory bufferRaderUintLUTDeviceMemory;
	VkBuffer bufferRaderUintLUT;
	VkDeviceMemory* bufferBluesteinDeviceMemory;
	VkDeviceMemory* bufferBluesteinFFTDeviceMemory;
	VkBuffer* bufferBluestein;
	VkBuffer* bufferBluesteinFFT;
#elif(VKFFT_BACKEND==1)
	void** inputBuffer;
	void** outputBuffer;
	CUmodule VkFFTModule;
	CUfunction VkFFTKernel;
	void* bufferLUT;
	void* bufferRaderUintLUT;
	CUdeviceptr consts_addr;
	void** bufferBluestein;
	void** bufferBluesteinFFT;
#elif(VKFFT_BACKEND==2)
	void** inputBuffer;
	void** outputBuffer;
	hipModule_t VkFFTModule;
	hipFunction_t VkFFTKernel;
	void* bufferLUT;
	void* bufferRaderUintLUT;
	hipDeviceptr_t consts_addr;
	void** bufferBluestein;
	void** bufferBluesteinFFT;
#elif(VKFFT_BACKEND==3)
	cl_mem* inputBuffer;
	cl_mem* outputBuffer;
	cl_program  program;
	cl_kernel kernel;
	cl_mem bufferLUT;
	cl_mem bufferRaderUintLUT;
	cl_mem* bufferBluestein;
	cl_mem* bufferBluesteinFFT;
#elif(VKFFT_BACKEND==4)
	void** inputBuffer;
	void** outputBuffer;
	ze_module_handle_t VkFFTModule;
	ze_kernel_handle_t VkFFTKernel;
	void* bufferLUT;
	void* bufferRaderUintLUT;
	void** bufferBluestein;
	void** bufferBluesteinFFT;
#endif

	// Backend-independent bookkeeping: kernel binary blob and the LUT buffer sizes.
	void* binary;
	uint64_t binarySize;

	uint64_t bufferLUTSize;
	uint64_t bufferRaderUintLUTSize;
	uint64_t referenceLUT;
} VkFFTAxis;

// A complete plan: per-axis kernels plus the optional R2C decomposition kernel and the
// inverse Bluestein kernels. The [3][4] arrays are presumably indexed [axis][upload]
// (cf. numAxisUploads[3]) - TODO confirm against the plan-construction code.
typedef struct {
	uint64_t actualFFTSizePerAxis[3][3];
	uint64_t numAxisUploads[3];
	uint64_t axisSplit[3][4];
	VkFFTAxis axes[3][4];

	uint64_t multiUploadR2C;
	uint64_t actualPerformR2CPerAxis[3]; // automatically specified, shows if R2C is actually performed or inside FFT or as a separate step
	VkFFTAxis R2Cdecomposition;
	VkFFTAxis inverseBluesteinAxes[3][4];
} VkFFTPlan;
// Top-level application object: the user configuration, the forward plan and an
// additional inverse plan, buffers shared between plans (Bluestein, Rader LUTs), and
// the state used for saving generated binaries to an application string.
// Buffer handle types depend on VKFFT_BACKEND.
typedef struct {
	VkFFTConfiguration configuration;
	VkFFTPlan* localFFTPlan;
	VkFFTPlan* localFFTPlan_inverse; //additional inverse plan

	uint64_t actualNumBatches;
	uint64_t firstAxis;
	uint64_t lastAxis;
	//Bluestein buffers reused among plans
	uint64_t useBluesteinFFT[3];
#if(VKFFT_BACKEND==0)
	VkDeviceMemory bufferRaderUintLUTDeviceMemory[3][4];
	VkBuffer bufferRaderUintLUT[3][4];
	VkDeviceMemory bufferBluesteinDeviceMemory[3];
	VkDeviceMemory bufferBluesteinFFTDeviceMemory[3];
	VkDeviceMemory bufferBluesteinIFFTDeviceMemory[3];
	VkBuffer bufferBluestein[3];
	VkBuffer bufferBluesteinFFT[3];
	VkBuffer bufferBluesteinIFFT[3];
#elif(VKFFT_BACKEND==1)
	void* bufferRaderUintLUT[3][4];
	void* bufferBluestein[3];
	void* bufferBluesteinFFT[3];
	void* bufferBluesteinIFFT[3];
#elif(VKFFT_BACKEND==2)
	void* bufferRaderUintLUT[3][4];
	void* bufferBluestein[3];
	void* bufferBluesteinFFT[3];
	void* bufferBluesteinIFFT[3];
#elif(VKFFT_BACKEND==3)
	cl_mem bufferRaderUintLUT[3][4];
	cl_mem bufferBluestein[3];
	cl_mem bufferBluesteinFFT[3];
	cl_mem bufferBluesteinIFFT[3];
#elif(VKFFT_BACKEND==4)
	void* bufferRaderUintLUT[3][4];
	void* bufferBluestein[3];
	void* bufferBluesteinFFT[3];
	void* bufferBluesteinIFFT[3];
#endif
	uint64_t bufferRaderUintLUTSize[3][4];
	uint64_t bufferBluesteinSize[3];
	void* applicationBluesteinString[3];
	uint64_t applicationBluesteinStringSize[3];

	// Rader FFT kernel bookkeeping; the three arrays share the fixed capacity of 30 entries.
	uint64_t numRaderFFTPrimes;
	uint64_t rader_primes[30];
	uint64_t rader_buffer_size[30];
	void* raderFFTkernel[30];
	uint64_t applicationStringOffsetRader;

	uint64_t currentApplicationStringPos;

	uint64_t applicationStringSize;//size of saveApplicationString in bytes
	void* saveApplicationString;//memory array(uint32_t* for Vulkan, char* for CUDA/HIP/OpenCL) through which user can access VkFFT generated binaries. (will be allocated by VkFFT, deallocated with deleteVkFFT call)
} VkFFTApplication;

// Appends the text currently held in sc->tempStr to the generated code in sc->output.
// sc->tempLen is expected to be the sprintf result for tempStr.
// Returns:
//   VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER if tempLen is negative (formatting failed),
//   VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER if the text plus its terminator does not fit,
//   VKFFT_SUCCESS otherwise, with sc->currentLen advanced past the appended text.
static inline VkFFTResult VkAppendLine(VkFFTSpecializationConstantsLayout* sc) {
	if (sc->tempLen < 0) return VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER;
	// sprintf also stores a terminating NUL after the copied characters, so one byte
	// beyond the text must stay inside the limit (assumes output holds maxCodeLength
	// bytes). Hence ">=" rather than ">".
	if (sc->currentLen + sc->tempLen >= sc->maxCodeLength) return VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER;
	sc->currentLen += sprintf(sc->output + sc->currentLen, "%s", sc->tempStr);
	return VKFFT_SUCCESS;
}
// Appends the caller-supplied string `in` (not tempStr) to the generated code in sc->output.
// Returns VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER if the text plus its terminator does not
// fit, VKFFT_SUCCESS otherwise, with sc->currentLen advanced past the appended text.
static inline VkFFTResult VkAppendLineFromInput(VkFFTSpecializationConstantsLayout* sc, const char* in) {
	int64_t inLen = (int64_t)strlen(in);
	// sprintf also stores a terminating NUL after the copied characters, so one byte
	// beyond the text must stay inside the limit (assumes output holds maxCodeLength
	// bytes). Hence ">=" rather than ">".
	if (sc->currentLen + inLen >= sc->maxCodeLength) return VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER;
	sc->currentLen += sprintf(sc->output + sc->currentLen, "%s", in);
	return VKFFT_SUCCESS;
}
// Writes the VkFFT license header (as // comments) into the generated code.
// Returns whatever VkAppendLine reports.
static inline VkFFTResult appendLicense(VkFFTSpecializationConstantsLayout* sc) {
	sc->tempLen = sprintf(sc->tempStr,
		"// This file is part of VkFFT, a Vulkan Fast Fourier Transform library\n"
		"//\n"
		"// Copyright (C) 2020 - present Dmitrii Tolmachev <dtolm96@gmail.com>\n"
		"//\n"
		"// Permission is hereby granted, free of charge, to any person obtaining a copy\n"
		"// of this software and associated documentation files (the \"Software\"), to deal\n"
		"// in the Software without restriction, including without limitation the rights\n"
		"// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n"
		"// copies of the Software, and to permit persons to whom the Software is\n"
		"// furnished to do so, subject to the following conditions:\n"
		"//\n"
		"// The above copyright notice and this permission notice shall be included in\n"
		"// all copies or substantial portions of the Software.\n"
		"//\n"
		"// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n"
		"// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n"
		"// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE\n"
		"// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n"
		"// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n"
		"// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n"
		"// THE SOFTWARE.\n");
	return VkAppendLine(sc);
}
// Emits the assignment "<out> = <in>;" for a complex value and appends it to the code.
static inline VkFFTResult VkMovComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in) {
	sc->tempLen = sprintf(sc->tempStr, "\t%s = %s;\n", out, in);
	return VkAppendLine(sc);
}
// Emits the assignment "<out> = <in>;" for a real value and appends it to the code.
static inline VkFFTResult VkMovReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in) {
	sc->tempLen = sprintf(sc->tempStr, "\t%s = %s;\n", out, in);
	return VkAppendLine(sc);
}
// Emits "sdata[<id>] = <in>;" and appends it to the code.
static inline VkFFTResult VkSharedStore(VkFFTSpecializationConstantsLayout* sc, const char* id, const char* in) {
	sc->tempLen = sprintf(sc->tempStr, "\tsdata[%s] = %s;\n", id, in);
	return VkAppendLine(sc);
}
// Emits "<out> = sdata[<id>];" and appends it to the code.
static inline VkFFTResult VkSharedLoad(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* id) {
	sc->tempLen = sprintf(sc->tempStr, "\t%s = sdata[%s];\n", out, id);
	return VkAppendLine(sc);
}
// Emits "<out> = <in_1> + <in_2>;" and appends it to the code.
static inline VkFFTResult VkAddReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr, "\t%s = %s + %s;\n", out, in_1, in_2);
	return VkAppendLine(sc);
}
// Emits a component-wise complex addition: out = in_1 + in_2 (.x and .y separately).
static inline VkFFTResult VkAddComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = %s.x + %s.x;\n"
		"\t%s.y = %s.y + %s.y;\n",
		out, in_1, in_2,
		out, in_1, in_2);
	return VkAppendLine(sc);
}
// Emits the negated component-wise sum: out = -in_1 - in_2 (.x and .y separately).
static inline VkFFTResult VkAddComplexInv(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = - %s.x - %s.x;\n"
		"\t%s.y = - %s.y - %s.y;\n",
		out, in_1, in_2,
		out, in_1, in_2);
	return VkAppendLine(sc);
}
// Emits the .x component only of a complex addition: out.x = in_1.x + in_2.x.
static inline VkFFTResult VkAddComplex_x(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr, "\t%s.x = %s.x + %s.x;\n", out, in_1, in_2);
	return VkAppendLine(sc);
}
// Emits the .y component only of a complex addition: out.y = in_1.y + in_2.y.
static inline VkFFTResult VkAddComplex_y(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr, "\t%s.y = %s.y + %s.y;\n", out, in_1, in_2);
	return VkAppendLine(sc);
}
// Emits a component-wise complex subtraction: out = in_1 - in_2 (.x and .y separately).
static inline VkFFTResult VkSubComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = %s.x - %s.x;\n"
		"\t%s.y = %s.y - %s.y;\n",
		out, in_1, in_2,
		out, in_1, in_2);
	return VkAppendLine(sc);
}
// Emits the .x component only of a complex subtraction: out.x = in_1.x - in_2.x.
static inline VkFFTResult VkSubComplex_x(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr, "\t%s.x = %s.x - %s.x;\n", out, in_1, in_2);
	return VkAppendLine(sc);
}
// Emits the .y component only of a complex subtraction: out.y = in_1.y - in_2.y.
static inline VkFFTResult VkSubComplex_y(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr, "\t%s.y = %s.y - %s.y;\n", out, in_1, in_2);
	return VkAppendLine(sc);
}
// Emits "<out> = <in_1> - <in_2>;" and appends it to the code.
static inline VkFFTResult VkSubReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr, "\t%s = %s - %s;\n", out, in_1, in_2);
	return VkAppendLine(sc);
}
// Emits two pairs of fma accumulations using a complex multiplier in_num:
//   out_1.x += in_1.x    * in_num.x;   out_1.y += in_conj.y * in_num.x;
//   out_2.x += in_1.y    * in_num.y;   out_2.y += in_conj.x * in_num.y;
// Each pair is appended separately; the first append failure is returned immediately.
static inline VkFFTResult VkFMA3Complex(VkFFTSpecializationConstantsLayout* sc, const char* out_1, const char* out_2, const char* in_1, const char* in_num, const char* in_conj) {
	VkFFTResult status;

	// Accumulate into out_1 using the .x part of the multiplier.
	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = fma(%s.x, %s.x, %s.x);\n"
		"\t%s.y = fma(%s.y, %s.x, %s.y);\n",
		out_1, in_1, in_num, out_1,
		out_1, in_conj, in_num, out_1);
	status = VkAppendLine(sc);
	if (status != VKFFT_SUCCESS) return status;

	// Accumulate into out_2 using the .y part of the multiplier.
	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = fma(%s.y, %s.y, %s.x);\n"
		"\t%s.y = fma(%s.x, %s.y, %s.y);\n",
		out_2, in_1, in_num, out_2,
		out_2, in_conj, in_num, out_2);
	return VkAppendLine(sc);
}
// Same accumulation pattern as VkFMA3Complex, but the multiplier is passed as two
// separate scalar expressions (in_num_x, in_num_y) instead of one complex identifier:
//   out_1.x += in_1.x * in_num_x;   out_1.y += in_conj.y * in_num_x;
//   out_2.x += in_1.y * in_num_y;   out_2.y += in_conj.x * in_num_y;
static inline VkFFTResult VkFMA3Complex_const_w(VkFFTSpecializationConstantsLayout* sc, const char* out_1, const char* out_2, const char* in_1, const char* in_num_x, const char* in_num_y, const char* in_conj) {
	VkFFTResult status;

	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = fma(%s.x, %s, %s.x);\n"
		"\t%s.y = fma(%s.y, %s, %s.y);\n",
		out_1, in_1, in_num_x, out_1,
		out_1, in_conj, in_num_x, out_1);
	status = VkAppendLine(sc);
	if (status != VKFFT_SUCCESS) return status;

	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = fma(%s.y, %s, %s.x);\n"
		"\t%s.y = fma(%s.x, %s, %s.y);\n",
		out_2, in_1, in_num_y, out_2,
		out_2, in_conj, in_num_y, out_2);
	return VkAppendLine(sc);
}
// Emits a component-wise fused multiply-add with a scalar: out = in_1 * in_num + in_2.
static inline VkFFTResult VkFMAComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = fma(%s.x, %s, %s.x);\n"
		"\t%s.y = fma(%s.y, %s, %s.y);\n",
		out, in_1, in_num, in_2,
		out, in_1, in_num, in_2);
	return VkAppendLine(sc);
}
// Emits "<out> = fma(<in_1>, <in_num>, <in_2>);" and appends it to the code.
static inline VkFFTResult VkFMAReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr, "\t%s = fma(%s, %s, %s);\n", out, in_1, in_num, in_2);
	return VkAppendLine(sc);
}
// Emits a complex multiplication: out = in_1 * in_2.
// When `out` names the same identifier as either operand, the product is first built in
// `temp` and then copied to `out`; in that case a NULL `temp` yields
// VKFFT_ERROR_NULL_TEMP_PASSED without emitting anything.
static inline VkFFTResult VkMulComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) {
	int outAliasesOperand = !strcmp(out, in_1) || !strcmp(out, in_2);

	if (!outAliasesOperand) {
		sc->tempLen = sprintf(sc->tempStr,
			"\t%s.x = %s.x * %s.x - %s.y * %s.y;\n"
			"\t%s.y = %s.y * %s.x + %s.x * %s.y;\n",
			out, in_1, in_2, in_1, in_2,
			out, in_1, in_2, in_1, in_2);
		return VkAppendLine(sc);
	}

	if (!temp) return VKFFT_ERROR_NULL_TEMP_PASSED;

	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = %s.x * %s.x - %s.y * %s.y;\n"
		"\t%s.y = %s.y * %s.x + %s.x * %s.y;\n"
		"\t%s = %s;\n",
		temp, in_1, in_2, in_1, in_2,
		temp, in_1, in_2, in_1, in_2,
		out, temp);
	return VkAppendLine(sc);
}
// Emits a multiplication by the conjugate of the second operand: out = in_1 * conj(in_2).
// When `out` names the same identifier as either operand, the product is first built in
// `temp` and then copied to `out`; in that case a NULL `temp` yields
// VKFFT_ERROR_NULL_TEMP_PASSED without emitting anything.
static inline VkFFTResult VkMulComplexConj(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) {
	int outAliasesOperand = !strcmp(out, in_1) || !strcmp(out, in_2);

	if (!outAliasesOperand) {
		sc->tempLen = sprintf(sc->tempStr,
			"\t%s.x = %s.x * %s.x + %s.y * %s.y;\n"
			"\t%s.y = %s.y * %s.x - %s.x * %s.y;\n",
			out, in_1, in_2, in_1, in_2,
			out, in_1, in_2, in_1, in_2);
		return VkAppendLine(sc);
	}

	if (!temp) return VKFFT_ERROR_NULL_TEMP_PASSED;

	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = %s.x * %s.x + %s.y * %s.y;\n"
		"\t%s.y = %s.y * %s.x - %s.x * %s.y;\n"
		"\t%s = %s;\n",
		temp, in_1, in_2, in_1, in_2,
		temp, in_1, in_2, in_1, in_2,
		out, temp);
	return VkAppendLine(sc);
}
// Emits a complex-by-scalar multiplication: out = in_1 * in_num (.x and .y separately).
static inline VkFFTResult VkMulComplexNumber(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) {
	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = %s.x * %s;\n"
		"\t%s.y = %s.y * %s;\n",
		out, in_1, in_num,
		out, in_1, in_num);
	return VkAppendLine(sc);
}
// Emits a multiplication by a purely imaginary scalar: out = i * in_num * in_1,
// i.e. out.x = -in_1.y * in_num and out.y = in_1.x * in_num.
// When `out` names the same identifier as `in_1`, the result is first built in `temp`
// and then copied to `out`; in that case a NULL `temp` yields
// VKFFT_ERROR_NULL_TEMP_PASSED without emitting anything.
static inline VkFFTResult VkMulComplexNumberImag(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* temp) {
	int outAliasesInput = !strcmp(out, in_1);

	if (!outAliasesInput) {
		sc->tempLen = sprintf(sc->tempStr,
			"\t%s.x = - %s.y * %s;\n"
			"\t%s.y = %s.x * %s;\n",
			out, in_1, in_num,
			out, in_1, in_num);
		return VkAppendLine(sc);
	}

	if (!temp) return VKFFT_ERROR_NULL_TEMP_PASSED;

	sc->tempLen = sprintf(sc->tempStr,
		"\t%s.x = - %s.y * %s;\n"
		"\t%s.y = %s.x * %s;\n"
		"\t%s = %s;\n",
		temp, in_1, in_num,
		temp, in_1, in_num,
		out, temp);
	return VkAppendLine(sc);
}
// Appends generated code that divides both components of in_1 by in_num:
//   out.x = in_1.x / in_num
//   out.y = in_1.y / in_num
// All string arguments are inserted verbatim into the generated code.
// Returns the result of VkAppendLine.
static inline VkFFTResult VkDivComplexNumber(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) {
	sc->tempLen = sprintf(sc->tempStr, "\
	%s.x = %s.x / %s;\n\
	%s.y = %s.y / %s;\n", out, in_1, in_num, out, in_1, in_num);
	return VkAppendLine(sc);
}

// Appends the generated statement "out = in_1 * in_2;".
// All string arguments are inserted verbatim into the generated code.
// Returns the result of VkAppendLine.
static inline VkFFTResult VkMulReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) {
	sc->tempLen = sprintf(sc->tempStr, "\
	%s = %s * %s;\n", out, in_1, in_2);
	return VkAppendLine(sc);
}

// Appends generated code that computes out = in_1 + i * in_2 for values with .x/.y members:
//   out.x = in_1.x - in_2.y
//   out.y = in_1.y + in_2.x
// out, in_1, in_2 and temp are identifier strings inserted verbatim into the generated code.
// When out names the same variable as in_2, the first assignment would overwrite in_2.x
// before the second one reads it, so the result is staged in temp and then assigned to out.
// Returns VKFFT_ERROR_NULL_TEMP_PASSED if staging is required but temp is NULL; otherwise
// returns whatever VkAppendLine returns.
static inline VkFFTResult VkShuffleComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) {
	if (strcmp(out, in_2) != 0) {
		sc->tempLen = sprintf(sc->tempStr, "\
	%s.x = %s.x - %s.y;\n\
	%s.y = %s.y + %s.x;\n", out, in_1, in_2, out, in_1, in_2);
	}
	else if (temp) {
		// The staged variant must use exactly the same formula as the in-place one
		// (in_1.y combined with in_2.x for the .y component); only the destination differs.
		sc->tempLen = sprintf(sc->tempStr, "\
	%s.x = %s.x - %s.y;\n\
	%s.y = %s.y + %s.x;\n\
	%s = %s;\n", temp, in_1, in_2, temp, in_1, in_2, out, temp);
	}
	else {
		return VKFFT_ERROR_NULL_TEMP_PASSED;
	}
	return VkAppendLine(sc);
}
// Appends generated code that computes out = in_1 - i * in_2 for values with .x/.y members:
//   out.x = in_1.x + in_2.y
//   out.y = in_1.y - in_2.x
// out, in_1, in_2 and temp are identifier strings inserted verbatim into the generated code.
// When out names the same variable as in_2, the first assignment would overwrite in_2.x
// before the second one reads it, so the result is staged in temp and then assigned to out.
// Returns VKFFT_ERROR_NULL_TEMP_PASSED if staging is required but temp is NULL; otherwise
// returns whatever VkAppendLine returns.
static inline VkFFTResult VkShuffleComplexInv(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) {
	if (strcmp(out, in_2) != 0) {
		sc->tempLen = sprintf(sc->tempStr, "\
	%s.x = %s.x + %s.y;\n\
	%s.y = %s.y - %s.x;\n", out, in_1, in_2, out, in_1, in_2);
	}
	else if (temp) {
		// The staged variant must use exactly the same formula as the in-place one
		// (in_1.y combined with in_2.x for the .y component); only the destination differs.
		sc->tempLen = sprintf(sc->tempStr, "\
	%s.x = %s.x + %s.y;\n\
	%s.y = %s.y - %s.x;\n\
	%s = %s;\n", temp, in_1, in_2, temp, in_1, in_2, out, temp);
	}
	else {
		return VKFFT_ERROR_NULL_TEMP_PASSED;
	}
	return VkAppendLine(sc);
}
// Appends the generated statement "out = in_1 % in_num;".
// All string arguments are inserted verbatim into the generated code.
// Returns the result of VkAppendLine.
static inline VkFFTResult VkModReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) {
	// "%%" in the format string produces a single literal '%' in the output.
	sc->tempLen = sprintf(sc->tempStr, "\
	%s = %s %% %s;\n", out, in_1, in_num);
	return VkAppendLine(sc);
}
// Appends the generated statement "out = in_1 / in_num;".
// All string arguments are inserted verbatim into the generated code.
// Returns the result of VkAppendLine.
static inline VkFFTResult VkDivReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) {
	sc->tempLen = sprintf(sc->tempStr, "\
	%s = %s / %s;\n", out, in_1, in_num);
	return VkAppendLine(sc);
}
// Reorders a set of identifier strings on the host side: after the call, slot i holds the
// string that was previously stored in slot permute[i]. No text is appended to the
// generated code.
//   type == 0 : permutes sc->locID[0 .. num_elem-1]
//   type == 1 : permutes regIDs[0 .. num_elem-1]
//   any other type : nothing happens
// The temp parameter is not used by this function. Always returns VKFFT_SUCCESS.
// NOTE(review): the snapshot buffer holds at most 33 strings of at most 19 characters each,
// and neither num_elem, the string lengths nor the values in permute are checked here -
// callers are presumably expected to guarantee those limits; confirm against call sites.
static inline VkFFTResult VkPermute(VkFFTSpecializationConstantsLayout* sc, const uint64_t* permute, const uint64_t num_elem, const uint64_t type, char** regIDs, const char* temp) {
	char snapshot[33][20];
	uint64_t slot;
	switch (type) {
	case 0:
		// Copy the current names aside, then write them back in permuted order.
		for (slot = 0; slot < num_elem; slot++)
			sprintf(snapshot[slot], "%s", sc->locID[slot]);
		for (slot = 0; slot < num_elem; slot++)
			sprintf(sc->locID[slot], "%s", snapshot[permute[slot]]);
		break;
	case 1:
		for (slot = 0; slot < num_elem; slot++)
			sprintf(snapshot[slot], "%s", regIDs[slot]);
		for (slot = 0; slot < num_elem; slot++)
			sprintf(regIDs[slot], "%s", snapshot[permute[slot]]);
		break;
	default:
		break;
	}
	return VKFFT_SUCCESS;
}
// Appends generated code that applies subgroupAdd() to each component of `in` and stores
// the results in `out`:
//   out.x = subgroupAdd(in.x);
//   out.y = subgroupAdd(in.y);
// Code is emitted only when VKFFT_BACKEND == 0. For every other backend the function
// appends nothing and returns VKFFT_SUCCESS (a shuffle-based variant for backend 1 existed
// only as disabled code). subWarpSplit is not used by the active code.
static inline VkFFTResult VkSubgroupAdd(VkFFTSpecializationConstantsLayout* sc, const char* in, const char* out, const uint64_t subWarpSplit) {
#if (VKFFT_BACKEND==0)
	VkFFTResult status;
	sc->tempLen = sprintf(sc->tempStr, "	%s.x = subgroupAdd(%s.x);\n", out, in);
	status = VkAppendLine(sc);
	if (status != VKFFT_SUCCESS) return status;
	sc->tempLen = sprintf(sc->tempStr, "	%s.y = subgroupAdd(%s.y);\n", out, in);
	return VkAppendLine(sc);
#else
	return VKFFT_SUCCESS;
#endif
}

static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunchConfiguration);
static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTLaunchParams* launchParams);

// Appends the "#version 450" directive followed by a blank line when VKFFT_BACKEND == 0.
// For every other backend nothing is appended and VKFFT_SUCCESS is returned.
static inline VkFFTResult appendVersion(VkFFTSpecializationConstantsLayout* sc) {
#if(VKFFT_BACKEND==0)
	sc->tempLen = sprintf(sc->tempStr, "#version 450\n\n");
	return VkAppendLine(sc);
#else
	return VKFFT_SUCCESS;
#endif
}
// Appends backend-specific extension / include directives to the generated code.
//   VKFFT_BACKEND == 0      : enables GL_ARB_gpu_shader_fp64 and GL_ARB_gpu_shader_int64 when
//                             floatType is "double" or sc->useUint64 is set; requires
//                             GL_EXT_shader_16bit_storage when any of the three memory types
//                             is "half".
//   VKFFT_BACKEND == 2      : emits "#include <hip/hip_runtime.h>" only if VKFFT_OLD_ROCM is
//                             defined.
//   VKFFT_BACKEND == 3 or 4 : enables cl_khr_fp64 when floatType is "double" or
//                             sc->useUint64 is set.
//   any other backend       : appends nothing.
// Returns the first non-success result from VkAppendLine, or VKFFT_SUCCESS.
static inline VkFFTResult appendExtensions(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeInputMemory, const char* floatTypeOutputMemory, const char* floatTypeKernelMemory) {
	VkFFTResult res = VKFFT_SUCCESS;
#if(VKFFT_BACKEND==0)
	if ((strcmp(floatType, "double") == 0) || (sc->useUint64)) {
		sc->tempLen = sprintf(sc->tempStr, "\
#extension GL_ARB_gpu_shader_fp64 : enable\n\
#extension GL_ARB_gpu_shader_int64 : enable\n\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if ((strcmp(floatTypeInputMemory, "half") == 0) || (strcmp(floatTypeOutputMemory, "half") == 0) || (strcmp(floatTypeKernelMemory, "half") == 0)) {
		sc->tempLen = sprintf(sc->tempStr, "#extension GL_EXT_shader_16bit_storage : require\n\n");
		res = VkAppendLine(sc);
	}
#elif(VKFFT_BACKEND==2)
#ifdef VKFFT_OLD_ROCM
	sc->tempLen = sprintf(sc->tempStr, "\
#include <hip/hip_runtime.h>\n");
	res = VkAppendLine(sc);
#endif
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	if ((strcmp(floatType, "double") == 0) || (sc->useUint64)) {
		sc->tempLen = sprintf(sc->tempStr, "\
#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n\n");
		res = VkAppendLine(sc);
	}
#endif
	return res;
}
// Appends the "layout (local_size_x = ..., local_size_y = ..., local_size_z = ...) in;"
// declaration built from sc->localSize[0..2] when VKFFT_BACKEND == 0.
// For every other backend nothing is appended and VKFFT_SUCCESS is returned.
static inline VkFFTResult appendLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc) {
#if(VKFFT_BACKEND==0)
	sc->tempLen = sprintf(sc->tempStr, "layout (local_size_x = %" PRIu64 ", local_size_y = %" PRIu64 ", local_size_z = %" PRIu64 ") in;\n", sc->localSize[0], sc->localSize[1], sc->localSize[2]);
	return VkAppendLine(sc);
#else
	return VKFFT_SUCCESS;
#endif
}
// Appends a constant definition of the form "<qualifier> type name = defaultVal<LFending>;".
// The qualifier is "__constant" when VKFFT_BACKEND is 3 or 4 and "const" otherwise.
// LFending is a literal suffix placed directly after defaultVal.
// Returns the result of VkAppendLine.
static inline VkFFTResult appendConstant(VkFFTSpecializationConstantsLayout* sc, const char* type, const char* name, const char* defaultVal, const char* LFending) {
#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	sc->tempLen = sprintf(sc->tempStr, "__constant %s %s = %s%s;\n", type, name, defaultVal, LFending);
#else
	sc->tempLen = sprintf(sc->tempStr, "const %s %s = %s%s;\n", type, name, defaultVal, LFending);
#endif
	return VkAppendLine(sc);
}
// Appends one tab-indented member declaration "type name;" to the generated code.
// Returns the result of VkAppendLine.
static inline VkFFTResult appendPushConstant(VkFFTSpecializationConstantsLayout* sc, const char* type, const char* name) {
	sc->tempLen = sprintf(sc->tempStr, "	%s %s;\n", type, name);
	return VkAppendLine(sc);
}
// Appends a workgroup barrier statement followed by a blank line, indented by numTab tabs.
// The statement depends on the backend:
//   VKFFT_BACKEND == 0      : barrier();
//   VKFFT_BACKEND == 1 or 2 : __syncthreads();
//   VKFFT_BACKEND == 3 or 4 : barrier(CLK_LOCAL_MEM_FENCE);
//   any other backend       : nothing is appended
// The indentation string always starts out empty, so numTab == 0 is well defined, and
// numTab tabs are accumulated (capped at what fits in the local buffer) rather than a
// single tab being rewritten on every iteration. Only the whitespace of the generated
// code is affected by the indentation.
// Returns the result of VkAppendLine, or VKFFT_SUCCESS if nothing was appended.
static inline VkFFTResult appendBarrierVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t numTab) {
	VkFFTResult res = VKFFT_SUCCESS;
	char tabs[100] = "";
	// Leave room for the terminating NUL.
	const uint64_t maxTabs = (uint64_t)(sizeof(tabs) - 1);
	const uint64_t tabCount = (numTab < maxTabs) ? numTab : maxTabs;
	for (uint64_t i = 0; i < tabCount; i++)
		tabs[i] = '\t';
	tabs[tabCount] = '\0';
#if(VKFFT_BACKEND==0)
	sc->tempLen = sprintf(sc->tempStr, "%sbarrier();\n\n", tabs);
	res = VkAppendLine(sc);
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	sc->tempLen = sprintf(sc->tempStr, "%s__syncthreads();\n\n", tabs);
	res = VkAppendLine(sc);
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	sc->tempLen = sprintf(sc->tempStr, "%sbarrier(CLK_LOCAL_MEM_FENCE);\n\n", tabs);
	res = VkAppendLine(sc);
#endif
	return res;
}
// Appends the declaration of the push-constant block / struct to the generated code.
// Nothing is appended when sc->pushConstantsStructSize is 0.
// Otherwise the output consists of:
//   1. an opening line: "layout(push_constant) uniform PushConsts {" for VKFFT_BACKEND == 0,
//      "typedef struct {" for backends 1-4;
//   2. one uintType member for each enabled option, in this order: workGroupShiftX/Y/Z
//      (sc->performWorkGroupShift[0..2]), inputOffset, outputOffset, kernelOffset
//      (sc->performPostCompilation{Input,Output,Kernel}Offset);
//   3. a closing line: "} consts;" for backend 0, "}PushConsts;" for backends 1-4, followed
//      by "__constant__ PushConsts consts;" for backends 1 and 2 only.
// floatType is not used by this function.
// Returns the first non-success result of an append, or VKFFT_SUCCESS.
static inline VkFFTResult appendPushConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) {
	static const char* const shiftMemberNames[3] = { "workGroupShiftX", "workGroupShiftY", "workGroupShiftZ" };
	VkFFTResult res = VKFFT_SUCCESS;
	if (sc->pushConstantsStructSize == 0)
		return res;

	// Opening line of the block.
#if(VKFFT_BACKEND==0)
	sc->tempLen = sprintf(sc->tempStr, "layout(push_constant) uniform PushConsts\n{\n");
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	sc->tempLen = sprintf(sc->tempStr, "	typedef struct {\n");
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
#endif

	// Members: per-axis workgroup shifts first, then the three buffer offsets.
	for (int axis = 0; axis < 3; axis++) {
		if (!sc->performWorkGroupShift[axis]) continue;
		res = appendPushConstant(sc, uintType, shiftMemberNames[axis]);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (sc->performPostCompilationInputOffset) {
		res = appendPushConstant(sc, uintType, "inputOffset");
		if (res != VKFFT_SUCCESS) return res;
	}
	if (sc->performPostCompilationOutputOffset) {
		res = appendPushConstant(sc, uintType, "outputOffset");
		if (res != VKFFT_SUCCESS) return res;
	}
	if (sc->performPostCompilationKernelOffset) {
		res = appendPushConstant(sc, uintType, "kernelOffset");
		if (res != VKFFT_SUCCESS) return res;
	}

	// Closing line(s) of the block.
#if(VKFFT_BACKEND==0)
	sc->tempLen = sprintf(sc->tempStr, "} consts;\n\n");
	res = VkAppendLine(sc);
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	sc->tempLen = sprintf(sc->tempStr, "	}PushConsts;\n");
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "	__constant__ PushConsts consts;\n");
	res = VkAppendLine(sc);
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	sc->tempLen = sprintf(sc->tempStr, "	}PushConsts;\n");
	res = VkAppendLine(sc);
#endif
	return res;
}
// Appends global constant definitions to the generated code.
// Always emitted: loc_PI and loc_SQRT1_2 of type floatType, with a literal suffix that is
// "f" for "float" and, for "double", "LF" (VKFFT_BACKEND == 0), "l" (backends 1 and 2) or
// empty (backends 3 and 4).
// When sc->useRader is set, for every entry of sc->raderContainer with prime > 0:
//   - if sc->inline_rader_g_pow == 1: an integer array g_pow_<prime>[prime] holding
//     1, g, g^2, ... (each taken mod prime), where g is the entry's generator;
//   - if sc->inline_rader_kernel is set: two floatType arrays r_rader_kernel_<prime> and
//     i_rader_kernel_<prime> of prime-1 elements each. For entries with type == 0 the values
//     are the even (r_) / odd (i_) indexed elements of raderFFTkernel divided by (prime-1);
//     otherwise they are cos(2*pi*g^(prime-1-j)/prime) and -sin(2*pi*g^(prime-1-j)/prime).
//     Values are only printed when floatType is "double" or "float".
// uintType is not used by this function.
// Returns the first non-success result of an append, or VKFFT_SUCCESS.
static inline VkFFTResult appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) {
	VkFFTResult res = VKFFT_SUCCESS;
	const int isFloat = (strcmp(floatType, "float") == 0);
	const int isDouble = (strcmp(floatType, "double") == 0);
	char LFending[4] = "";
	char uintType_32[30];
	if (isFloat) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (isDouble) sprintf(LFending, "LF");
	sprintf(uintType_32, "uint");
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	if (isDouble) sprintf(LFending, "l");
	sprintf(uintType_32, "unsigned int");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	// No literal suffix is set for "double" on these backends.
	sprintf(uintType_32, "unsigned int");
#endif

	res = appendConstant(sc, floatType, "loc_PI", "3.1415926535897932384626433832795", LFending);
	if (res != VKFFT_SUCCESS) return res;
	res = appendConstant(sc, floatType, "loc_SQRT1_2", "0.70710678118654752440084436210485", LFending);
	if (res != VKFFT_SUCCESS) return res;
	if (!sc->useRader) return res;

	for (uint64_t i = 0; i < sc->numRaderPrimes; i++) {
		if (!(sc->raderContainer[i].prime > 0)) continue;
		const uint64_t prime = sc->raderContainer[i].prime;
		const uint64_t generator = sc->raderContainer[i].generator;

		if (sc->inline_rader_g_pow == 1) {
			// Table of successive generator powers modulo prime, starting with 1.
			uint64_t power = 1;
#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			sc->tempLen = sprintf(sc->tempStr, "__constant %s g_pow_%" PRIu64 "[%" PRIu64 "]= {1", uintType_32, prime, prime);
#else
			sc->tempLen = sprintf(sc->tempStr, "const %s g_pow_%" PRIu64 "[%" PRIu64 "]= {1", uintType_32, prime, prime);
#endif
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t t = 0; t < prime - 1; t++) {
				power = (power * generator) % prime;
				sc->tempLen = sprintf(sc->tempStr, ", %" PRIu64 "", power);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			sc->tempLen = sprintf(sc->tempStr, "};\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		if (!sc->inline_rader_kernel) continue;

		// part 0 emits the real-part array (r_rader_kernel_), part 1 the imaginary-part
		// array (i_rader_kernel_); the complete r_ array is written before the i_ array.
		for (int part = 0; part < 2; part++) {
#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			if (part == 0)
				sc->tempLen = sprintf(sc->tempStr, "__constant %s r_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, prime, prime - 1);
			else
				sc->tempLen = sprintf(sc->tempStr, "__constant %s i_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, prime, prime - 1);
#else
			if (part == 0)
				sc->tempLen = sprintf(sc->tempStr, "const %s r_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, prime, prime - 1);
			else
				sc->tempLen = sprintf(sc->tempStr, "const %s i_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, prime, prime - 1);
#endif
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			// NOTE: the element loops below carried a "fix later" marker from the original author.
			for (uint64_t j = 0; j < (prime - 1); j++) {
				if (sc->raderContainer[i].type == 0) {
					// Values come from the stored kernel: interleaved (real, imag) pairs,
					// each divided by (prime - 1).
					if (isDouble) {
						const double* kernelValues = (const double*)sc->raderContainer[i].raderFFTkernel;
						sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", kernelValues[2 * j + part] / (prime - 1), LFending);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (isFloat) {
						const float* kernelValues = (const float*)sc->raderContainer[i].raderFFTkernel;
						sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", kernelValues[2 * j + part] / (prime - 1), LFending);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				else {
					// Values are computed on the host: generator^(prime-1-j) mod prime selects
					// the angle 2*pi*power/prime.
					const long double double_PI = 3.14159265358979323846264338327950288419716939937510L;
					uint64_t power = 1;
					for (uint64_t t = 0; t < prime - 1 - j; t++) {
						power = (power * generator) % prime;
					}
					const long double angle = 2.0 * power * double_PI / prime;
					if (isDouble) {
						sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", (part == 0) ? (double)cos(angle) : (double)(-sin(angle)), LFending);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (isFloat) {
						sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", (part == 0) ? (float)cos(angle) : (float)(-sin(angle)), LFending);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				// Separator after every element except the last, which closes the array.
				if (j < (prime - 2)) {
					sc->tempLen = sprintf(sc->tempStr, ", ");
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "};\n");
				}
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
	}
	return res;
}
// Appends the definition of the generated helper function sincos_20(double x), preceded by
// the constants it relies on (loc_2_PI, loc_PI_2 and the polynomial coefficients a1..a17, ab).
// The generated function returns a two-component vector with .x = cosine and .y = sine,
// computed from a polynomial in the reduced argument (see the emitted source text below).
//   floatType selects the type of the emitted constants, their literal suffix and the
//   vector type used as return type ("half", "float" and "double" are recognised).
//   uintType is not used by this function.
// For backends 1 and 2 the function is prefixed with "__device__ static __inline__ ",
// for backends 3 and 4 with "static __inline__ ".
// vecType starts out as an empty string so that an unrecognised floatType results in an
// empty type name in the generated code instead of a read of an uninitialized buffer.
// Returns the first non-success result of an append, or VKFFT_SUCCESS.
static inline VkFFTResult appendSinCos20(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) {
	// name / value pairs, emitted in this order through appendConstant
	static const char* const constantTable[12][2] = {
		{ "loc_2_PI", "0.63661977236758134307553505349006" },
		{ "loc_PI_2", "1.5707963267948966192313216916398" },
		{ "a1", "0.99999999999999999999962122687403772" },
		{ "a3", "-0.166666666666666666637194166219637268" },
		{ "a5", "0.00833333333333333295212653322266277182" },
		{ "a7", "-0.000198412698412696489459896530659927773" },
		{ "a9", "2.75573192239364018847578909205399262e-6" },
		{ "a11", "-2.50521083781017605729370231280411712e-8" },
		{ "a13", "1.60590431721336942356660057796782021e-10" },
		{ "a15", "-7.64712637907716970380859898835680587e-13" },
		{ "a17", "2.81018528153898622636194976499656274e-15" },
		{ "ab", "-7.97989713648499642889739108679114937e-18" }
	};
	VkFFTResult res = VKFFT_SUCCESS;
	char functionDefinitions[100] = "";
	char vecType[30] = "";
	char LFending[4] = "";
	const int isHalf = (strcmp(floatType, "half") == 0);
	const int isFloat = (strcmp(floatType, "float") == 0);
	const int isDouble = (strcmp(floatType, "double") == 0);
	if (isFloat) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (isHalf) sprintf(vecType, "f16vec2");
	if (isFloat) sprintf(vecType, "vec2");
	if (isDouble) {
		sprintf(vecType, "dvec2");
		sprintf(LFending, "LF");
	}
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	if (isHalf) sprintf(vecType, "f16vec2");
	if (isFloat) sprintf(vecType, "float2");
	if (isDouble) {
		sprintf(vecType, "double2");
		sprintf(LFending, "l");
	}
	sprintf(functionDefinitions, "__device__ static __inline__ ");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	if (isHalf) sprintf(vecType, "f16vec2");
	if (isFloat) sprintf(vecType, "float2");
	// No literal suffix is set for "double" on these backends.
	if (isDouble) sprintf(vecType, "double2");
	sprintf(functionDefinitions, "static __inline__ ");
#endif
	for (int k = 0; k < 12; k++) {
		res = appendConstant(sc, floatType, constantTable[k][0], constantTable[k][1], LFending);
		if (res != VKFFT_SUCCESS) return res;
	}
	sc->tempLen = sprintf(sc->tempStr, "\
%s%s sincos_20(double x)\n\
{\n\
	//minimax coefs for sin for 0..pi/2 range\n\
	double y = abs(x * loc_2_PI);\n\
	double q = floor(y);\n\
	int quadrant = int(q);\n\
	double t = (quadrant & 1) != 0 ? 1 - y + q : y - q;\n\
	t *= loc_PI_2;\n\
	double t2 = t * t;\n\
	double r = fma(fma(fma(fma(fma(fma(fma(fma(fma(ab, t2, a17), t2, a15), t2, a13), t2, a11), t2, a9), t2, a7), t2, a5), t2, a3), t2 * t, t);\n\
	%s cos_sin;\n\
	cos_sin.x = ((quadrant == 0) || (quadrant == 3)) ? sqrt(1 - r * r) : -sqrt(1 - r * r);\n\
	r = x < 0 ? -r : r;\n\
	cos_sin.y = (quadrant & 2) != 0 ? -r : r;\n\
	return cos_sin;\n\
}\n\n", functionDefinitions, vecType, vecType);
	return VkAppendLine(sc);
}
// Appends two generated conversion functions between the two-component vector types that
// correspond to floatType and floatTypeDifferent:
//   <vecType> conv_<vecType>(<vecTypeDifferent> input)
//   <vecTypeDifferent> conv_<vecTypeDifferent>(<vecType> input)
// Each one casts .x and .y of its input to the target scalar type.
// Nothing is appended when VKFFT_BACKEND == 0. For backends 1 and 2 the functions are
// prefixed with "__device__ static __inline__ ", for backends 3 and 4 with
// "static __inline__ ".
// Recognised type names are "half", "float" and "double"; both vector-type buffers start
// out empty so that an unrecognised name results in an empty type name in the generated
// code instead of a read of an uninitialized buffer.
// Returns the first non-success result of an append, or VKFFT_SUCCESS.
static inline VkFFTResult appendConversion(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeDifferent) {
	VkFFTResult res = VKFFT_SUCCESS;
#if(VKFFT_BACKEND!=0)
	char functionDefinitions[100] = "";
	char vecType[30] = "";
	char vecTypeDifferent[30] = "";
#if((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	sprintf(functionDefinitions, "__device__ static __inline__ ");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	sprintf(functionDefinitions, "static __inline__ ");
#endif
	// Map each scalar type name to its two-component vector type name.
	if (strcmp(floatType, "half") == 0) sprintf(vecType, "f16vec2");
	if (strcmp(floatType, "float") == 0) sprintf(vecType, "float2");
	if (strcmp(floatType, "double") == 0) sprintf(vecType, "double2");
	if (strcmp(floatTypeDifferent, "half") == 0) sprintf(vecTypeDifferent, "f16vec2");
	if (strcmp(floatTypeDifferent, "float") == 0) sprintf(vecTypeDifferent, "float2");
	if (strcmp(floatTypeDifferent, "double") == 0) sprintf(vecTypeDifferent, "double2");

	// vecTypeDifferent -> vecType
	sc->tempLen = sprintf(sc->tempStr, "\
%s%s conv_%s(%s input)\n\
{\n\
	%s ret_val;\n\
	ret_val.x = (%s) input.x;\n\
	ret_val.y = (%s) input.y;\n\
	return ret_val;\n\
}\n\n", functionDefinitions, vecType, vecType, vecTypeDifferent, vecType, floatType, floatType);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;

	// vecType -> vecTypeDifferent
	sc->tempLen = sprintf(sc->tempStr, "\
%s%s conv_%s(%s input)\n\
{\n\
	%s ret_val;\n\
	ret_val.x = (%s) input.x;\n\
	ret_val.y = (%s) input.y;\n\
	return ret_val;\n\
}\n\n", functionDefinitions, vecTypeDifferent, vecTypeDifferent, vecType, vecTypeDifferent, floatTypeDifferent, floatTypeDifferent);
	res = VkAppendLine(sc);
#endif
	return res;
}
// Records the byte size of one input element in sc->inputNumberByteSize and, when
// VKFFT_BACKEND == 0, appends the declaration of the input storage buffer "DataIn"
// (std430 layout, binding = id) to the generated code.
//   inputType 0, 1, 2, 3, 4, 6:
//       two-component element; the byte size is twice the scalar size and, for backends
//       0-4 only, is set here. The buffer array has sc->inputBufferBlockSize elements.
//   inputType 5, 110, 111, 120, 121, 130, 131, 140-145:
//       scalar element; the byte size is the scalar size (set for every backend). The
//       buffer array has 2 * sc->inputBufferBlockSize elements.
//   any other inputType: nothing is done.
// In both groups the buffer is declared as a single unnamed block when
// sc->inputBufferBlockNum == 1 and as an array "inputBlocks[inputBufferBlockNum]" otherwise.
// floatTypeMemory is expected to be "half", "float" or "double"; the element type name
// starts out empty so that an unrecognised value results in an empty type name in the
// generated code instead of a read of an uninitialized buffer.
// Returns the result of VkAppendLine, or VKFFT_SUCCESS if nothing was appended.
static inline VkFFTResult appendInputLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory, uint64_t inputType) {
	VkFFTResult res = VKFFT_SUCCESS;
	char vecType[30] = "";
	const int isHalf = (strcmp(floatTypeMemory, "half") == 0);
	const int isFloat = (strcmp(floatTypeMemory, "float") == 0);
	const int isDouble = (strcmp(floatTypeMemory, "double") == 0);
	switch (inputType) {
	case 0: case 1: case 2: case 3: case 4: case 6: {
#if(VKFFT_BACKEND==0)
		if (isHalf) {
			sc->inputNumberByteSize = 2 * 2;
			sprintf(vecType, "f16vec2");
		}
		if (isFloat) {
			sc->inputNumberByteSize = 2 * sizeof(float);
			sprintf(vecType, "vec2");
		}
		if (isDouble) {
			sc->inputNumberByteSize = 2 * sizeof(double);
			sprintf(vecType, "dvec2");
		}
		if (sc->inputBufferBlockNum == 1) {
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\
	%s inputs[%" PRIu64 "];\n\
};\n\n", id, vecType, sc->inputBufferBlockSize);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\
	%s inputs[%" PRIu64 "];\n\
} inputBlocks[%" PRIu64 "];\n\n", id, vecType, sc->inputBufferBlockSize, sc->inputBufferBlockNum);
		}
		res = VkAppendLine(sc);
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		// No buffer declaration is emitted for these backends; only the element size is recorded.
		if (isHalf) {
			sc->inputNumberByteSize = 2 * 2;
			sprintf(vecType, "f16vec2");
		}
		if (isFloat) {
			sc->inputNumberByteSize = 2 * sizeof(float);
			sprintf(vecType, "float2");
		}
		if (isDouble) {
			sc->inputNumberByteSize = 2 * sizeof(double);
			sprintf(vecType, "double2");
		}
#endif
		break;
	}
	case 5: case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145:
	{
		if (isHalf) {
			sc->inputNumberByteSize = 2;
			sprintf(vecType, "float16_t");
		}
		if (isFloat) {
			sc->inputNumberByteSize = sizeof(float);
			sprintf(vecType, "float");
		}
		if (isDouble) {
			sc->inputNumberByteSize = sizeof(double);
			sprintf(vecType, "double");
		}
#if(VKFFT_BACKEND==0)
		// Scalar elements: the declared array length is twice the block size.
		if (sc->inputBufferBlockNum == 1) {
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\
	%s inputs[%" PRIu64 "];\n\
};\n\n", id, vecType, 2 * sc->inputBufferBlockSize);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\
	%s inputs[%" PRIu64 "];\n\
} inputBlocks[%" PRIu64 "];\n\n", id, vecType, 2 * sc->inputBufferBlockSize, sc->inputBufferBlockNum);
		}
		res = VkAppendLine(sc);
#endif
		break;
	}
	default:
		// Unlisted input types are left untouched.
		break;
	}
	return res;
}
// Records the size in bytes of one output element in sc->outputNumberByteSize and, when
// built with VKFFT_BACKEND==0, appends the GLSL declaration of the output storage buffer
// to the generated code through VkAppendLine.
//   sc              - layout being generated; outputNumberByteSize, tempStr and tempLen are written
//   id              - binding index printed into the GLSL layout qualifier
//   floatTypeMemory - "half", "float" or "double"; any other string leaves outputNumberByteSize
//                     untouched and produces an empty element type name
//   outputType      - 0-5 select two-component element types, 6 and 110-145 select scalar
//                     element types; any other value is ignored
// Returns VKFFT_SUCCESS, or the first non-success code reported by VkAppendLine.
static inline VkFFTResult appendOutputLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory, uint64_t outputType) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Starts out empty: if floatTypeMemory is not one of the recognised names, the %s
	// conversions below print nothing instead of reading an indeterminate buffer.
	char vecType[30] = "";
	switch (outputType) {
	case 0: case 1: case 2: case 3: case 4: case 5: {
#if(VKFFT_BACKEND==0)
		if (!strcmp(floatTypeMemory, "half")) {
			sc->outputNumberByteSize = 2 * 2;
			sprintf(vecType, "f16vec2");
		}
		else if (!strcmp(floatTypeMemory, "float")) {
			sc->outputNumberByteSize = 2 * sizeof(float);
			sprintf(vecType, "vec2");
		}
		else if (!strcmp(floatTypeMemory, "double")) {
			sc->outputNumberByteSize = 2 * sizeof(double);
			sprintf(vecType, "dvec2");
		}
		if (sc->outputBufferBlockNum == 1) {
			// single block: anonymous buffer holding outputBufferBlockSize elements
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
	%s outputs[%" PRIu64 "];\n\
};\n\n", id, vecType, sc->outputBufferBlockSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			// several blocks: the buffer is declared as an array named outputBlocks
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
	%s outputs[%" PRIu64 "];\n\
} outputBlocks[%" PRIu64 "];\n\n", id, vecType, sc->outputBufferBlockSize, sc->outputBufferBlockNum);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		// Backends 1-4 use the same type names; no declaration is emitted here,
		// only the element size is recorded.
		if (!strcmp(floatTypeMemory, "half")) {
			sc->outputNumberByteSize = 2 * 2;
			sprintf(vecType, "f16vec2");
		}
		else if (!strcmp(floatTypeMemory, "float")) {
			sc->outputNumberByteSize = 2 * sizeof(float);
			sprintf(vecType, "float2");
		}
		else if (!strcmp(floatTypeMemory, "double")) {
			sc->outputNumberByteSize = 2 * sizeof(double);
			sprintf(vecType, "double2");
		}
#endif
		break;
	}
	case 6: case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145:
	{
		// Scalar element types are named identically for every backend.
		if (!strcmp(floatTypeMemory, "half")) {
			sc->outputNumberByteSize = 2;
			sprintf(vecType, "float16_t");
		}
		else if (!strcmp(floatTypeMemory, "float")) {
			sc->outputNumberByteSize = sizeof(float);
			sprintf(vecType, "float");
		}
		else if (!strcmp(floatTypeMemory, "double")) {
			sc->outputNumberByteSize = sizeof(double);
			sprintf(vecType, "double");
		}
#if(VKFFT_BACKEND==0)
		// The declared array length is twice outputBufferBlockSize for scalar elements.
		if (sc->outputBufferBlockNum == 1) {
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
	%s outputs[%" PRIu64 "];\n\
};\n\n", id, vecType, 2 * sc->outputBufferBlockSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
	%s outputs[%" PRIu64 "];\n\
} outputBlocks[%" PRIu64 "];\n\n", id, vecType, 2 * sc->outputBufferBlockSize, sc->outputBufferBlockNum);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
#endif
		break;
	}
	default:
		// unlisted output types leave sc untouched
		break;
	}
	return res;
}
// Records the size in bytes of one kernel-buffer element in sc->kernelNumberByteSize and,
// when built with VKFFT_BACKEND==0, appends the GLSL declaration of the Kernel_FFT storage
// buffer to the generated code through VkAppendLine.
//   sc              - layout being generated; kernelNumberByteSize, tempStr and tempLen are written
//   id              - binding index printed into the GLSL layout qualifier
//   floatTypeMemory - "half", "float" or "double"; any other string leaves kernelNumberByteSize
//                     untouched and produces an empty element type name
// Returns VKFFT_SUCCESS, or the non-success code reported by VkAppendLine.
static inline VkFFTResult appendKernelLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Starts out empty: if floatTypeMemory is not one of the recognised names, the %s
	// conversions below print nothing instead of reading an indeterminate buffer.
	char vecType[30] = "";
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatTypeMemory, "half")) {
		sc->kernelNumberByteSize = 2 * 2;
		sprintf(vecType, "f16vec2");
	}
	else if (!strcmp(floatTypeMemory, "float")) {
		sc->kernelNumberByteSize = 2 * sizeof(float);
		sprintf(vecType, "vec2");
	}
	else if (!strcmp(floatTypeMemory, "double")) {
		sc->kernelNumberByteSize = 2 * sizeof(double);
		sprintf(vecType, "dvec2");
	}
	if (sc->kernelBlockNum == 1) {
		// single block: anonymous buffer holding kernelBlockSize elements
		sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer Kernel_FFT{\n\
	%s kernel_obj[%" PRIu64 "];\n\
};\n\n", id, vecType, sc->kernelBlockSize);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	else {
		// several blocks: the buffer is declared as an array named kernelBlocks
		sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer Kernel_FFT{\n\
	%s kernel_obj[%" PRIu64 "];\n\
} kernelBlocks[%" PRIu64 "];\n\n", id, vecType, sc->kernelBlockSize, sc->kernelBlockNum);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	// Backends 1-4 use the same type names; no declaration is emitted here,
	// only the element size is recorded.
	if (!strcmp(floatTypeMemory, "half")) {
		sc->kernelNumberByteSize = 2 * 2;
		sprintf(vecType, "f16vec2");
	}
	else if (!strcmp(floatTypeMemory, "float")) {
		sc->kernelNumberByteSize = 2 * sizeof(float);
		sprintf(vecType, "float2");
	}
	else if (!strcmp(floatTypeMemory, "double")) {
		sc->kernelNumberByteSize = 2 * sizeof(double);
		sprintf(vecType, "double2");
	}
#endif
	return res;
}
// When built with VKFFT_BACKEND==0, appends the GLSL declaration of the read-only
// storage buffer DataLUT, which exposes an unsized array named twiddleLUT, to the
// generated code through VkAppendLine. For backends 1-4 nothing is emitted.
//   sc        - layout being generated; tempStr and tempLen are written for backend 0
//   id        - binding index printed into the GLSL layout qualifier
//   floatType - "float" or "double"; any other string (including "half") produces an
//               empty element type name
// Returns VKFFT_SUCCESS, or the non-success code reported by VkAppendLine.
static inline VkFFTResult appendLUTLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatType) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Starts out empty: if floatType is not one of the recognised names, the %s
	// conversion below prints nothing instead of reading an indeterminate buffer.
	char vecType[30] = "";
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
	else if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
	sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") readonly buffer DataLUT {\n\
%s twiddleLUT[];\n\
};\n", id, vecType);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	// Backends 1-4 use the same type names; the name is selected but not emitted here.
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	else if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
#endif
	return res;
}
// When built with VKFFT_BACKEND==0, appends the GLSL declaration of the read-only
// storage buffer DataRaderUintLUT, which exposes an unsized array named g_pow, to the
// generated code through VkAppendLine. For backends 1-4 only the spelling of the
// element type is selected and nothing is emitted.
//   sc - layout being generated; tempStr and tempLen are written for backend 0
//   id - binding index printed into the GLSL layout qualifier
// Returns VKFFT_SUCCESS, or the non-success code reported by VkAppendLine.
static inline VkFFTResult appendRaderUintLUTLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id) {
	VkFFTResult status = VKFFT_SUCCESS;
#if(VKFFT_BACKEND==0)
	// GLSL spelling of the element type stored in g_pow
	const char* const elementType = "uint";
	sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") readonly buffer DataRaderUintLUT {\n\
%s g_pow[];\n\
};\n", id, elementType);
	status = VkAppendLine(sc);
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	// Spelling used by backends 1-4; kept for parity, not emitted by this function.
	const char* const elementType = "unsigned int";
	(void)elementType;
#endif
	return status;
}
// When built with VKFFT_BACKEND==0, appends up to two read-only GLSL storage buffer
// declarations to the generated code through VkAppendLine:
//   - DataBluesteinConvolutionKernel, if sc->BluesteinConvolutionStep is set;
//   - DataBluesteinMultiplication, if sc->BluesteinPreMultiplication or
//     sc->BluesteinPostMultiplication is set.
// Bindings are assigned consecutively starting at id, so the second buffer gets id + 1
// only when the first one was declared. For backends 1-4 nothing is emitted.
//   sc        - layout being generated; tempStr and tempLen are written for backend 0
//   id        - first binding index to use
//   floatType - "float" or "double"; any other string produces an empty element type name
// Returns VKFFT_SUCCESS, or the first non-success code reported by VkAppendLine.
static inline VkFFTResult appendBluesteinLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatType) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Starts out empty: if floatType is not one of the recognised names, the %s
	// conversions below print nothing instead of reading an indeterminate buffer.
	char vecType[30] = "";
#if(VKFFT_BACKEND==0)
	uint64_t loc_id = id;
	if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
	else if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
	if (sc->BluesteinConvolutionStep) {
		sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") readonly buffer DataBluesteinConvolutionKernel {\n\
%s BluesteinConvolutionKernel[];\n\
};\n", loc_id, vecType);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		loc_id++;
	}
	if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) {
		sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") readonly buffer DataBluesteinMultiplication {\n\
%s BluesteinMultiplication[];\n\
};\n", loc_id, vecType);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		// keeps loc_id pointing at the next free binding
		loc_id++;
	}
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	// Backends 1-4 use the same type names; the name is selected but not emitted here.
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	else if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
#endif
	return res;
}
// Appends to the generated code the index expression that addresses one element of the
// input buffer. The expression is the concatenation of up to six textual terms:
// constant offset, X, Y, Z, coordinate and batch. Only the Y term depends on the
// addressing scheme selected by inputType % 1000; types outside both case lists emit nothing.
//   sc         - layout being generated; tempStr and tempLen are written
//   uintType   - not used by this function
//   inputType  - values >= 1000 switch the post-compilation offset name to consts.kernelOffset
//   index_x    - expression text for the X index
//   index_y    - expression text for the Y index (grouped scheme only; may be null)
//   coordinate - expression text used when matrixConvolution > 1 during a convolution step
//   batchID    - expression text used when numKernels > 1 during a convolution step
// Returns VKFFT_SUCCESS, or the non-success code reported by VkAppendLine.
static inline VkFFTResult indexInputVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* uintType, uint64_t inputType, const char* index_x, const char* index_y, const char* coordinate, const char* batchID) {
	int groupedLayout = 0;
	switch (inputType % 1000) {
	case 0: case 2: case 3: case 4: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c + single_c2c_strided
		groupedLayout = 0;
		break;
	case 1: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c
		groupedLayout = 1;
		break;
	default:
		return VKFFT_SUCCESS;
	}

	// Offset: a compile-time element count when sc->inputOffset is set, otherwise an
	// optional push-constant name.
	char offsetTerm[30] = "";
	if (sc->inputOffset > 0)
		sprintf(offsetTerm, "%" PRIu64 " + ", sc->inputOffset / sc->inputNumberByteSize);
	else if (sc->performPostCompilationInputOffset)
		sprintf(offsetTerm, "%s", (inputType < 1000) ? "consts.inputOffset + " : "consts.kernelOffset + ");

	// X: the stride multiplication is omitted when the stride is 1.
	char xTerm[500] = "";
	if (sc->inputStride[0] != 1)
		sprintf(xTerm, "(%s) * %" PRIu64 "", index_x, sc->inputStride[0]);
	else
		sprintf(xTerm, "(%s)", index_x);

	// Y: taken from index_y in the grouped scheme, from the workgroup id otherwise.
	char yTerm[500] = "";
	if (groupedLayout) {
		if (index_y)
			sprintf(yTerm, " + (%s) * %" PRIu64 "", index_y, sc->inputStride[1]);
	}
	else if (sc->size[1] > 1) {
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		uint64_t yStride = sc->inputStride[1];
		if (sc->numAxisUploads == 1)
			yStride = mult * sc->localSize[sc->axisSwapped ? 0 : 1] * sc->inputStride[1];
		if (sc->performWorkGroupShift[1])
			sprintf(yTerm, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, yStride);
		else
			sprintf(yTerm, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, yStride);
	}

	// Z: reduced modulo dispatchZactualFFTSize when the z id also encodes coordinates/batches.
	char zTerm[500] = "";
	if (sc->size[2] > 1) {
		int zIsShared = (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1);
		if (zIsShared && sc->performWorkGroupShift[2])
			sprintf(zTerm, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->inputStride[2]);
		else if (zIsShared)
			sprintf(zTerm, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->inputStride[2]);
		else if (sc->performWorkGroupShift[2])
			sprintf(zTerm, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->inputStride[2]);
		else
			sprintf(zTerm, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->inputStride[2]);
	}

	// Coordinate: derived from the z id, or from the caller's expression during a
	// matrix convolution step (which also resets maxCoordinate for the batch term).
	char coordinateTerm[500] = "";
	uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution;
	if (sc->numCoordinates * sc->matrixConvolution > 1)
		sprintf(coordinateTerm, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->inputStride[3]);
	if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) {
		maxCoordinate = 1;
		sprintf(coordinateTerm, " + %s * %" PRIu64 "", coordinate, sc->inputStride[3]);
	}

	// Batch: the caller's batchID during a multi-kernel convolution step, else derived from the z id.
	char batchTerm[500] = "";
	if ((sc->numBatches > 1) || (sc->numKernels > 1)) {
		if (sc->convolutionStep && (sc->numKernels > 1))
			sprintf(batchTerm, " + %s * %" PRIu64 "", batchID, sc->inputStride[4]);
		else
			sprintf(batchTerm, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->inputStride[4]);
	}

	sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", offsetTerm, xTerm, yTerm, zTerm, coordinateTerm, batchTerm);
	return VkAppendLine(sc);
}
// Appends to the generated code the index expression that addresses one element of the
// output buffer. The expression is the concatenation of up to six textual terms:
// constant offset, X, Y, Z, coordinate and batch. Only the Y term depends on the
// addressing scheme selected by outputType % 1000; types outside both case lists emit nothing.
//   sc         - layout being generated; tempStr and tempLen are written
//   uintType   - not used by this function
//   outputType - values >= 1000 switch the post-compilation offset name to consts.kernelOffset
//   index_x    - expression text for the X index
//   index_y    - expression text for the Y index (grouped scheme only; may be null)
//   coordinate - expression text used when matrixConvolution > 1 during a convolution step
//   batchID    - expression text used when numKernels > 1 during a convolution step
// Returns VKFFT_SUCCESS, or the non-success code reported by VkAppendLine.
static inline VkFFTResult indexOutputVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* uintType, uint64_t outputType, const char* index_x, const char* index_y, const char* coordinate, const char* batchID) {
	int groupedLayout = 0;
	switch (outputType % 1000) {
	case 0: case 2: case 3: case 4: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c + single_c2c_strided
		groupedLayout = 0;
		break;
	case 1: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c
		groupedLayout = 1;
		break;
	default:
		return VKFFT_SUCCESS;
	}

	// Offset: a compile-time element count when sc->outputOffset is set, otherwise an
	// optional push-constant name.
	char offsetTerm[30] = "";
	if (sc->outputOffset > 0)
		sprintf(offsetTerm, "%" PRIu64 " + ", sc->outputOffset / sc->outputNumberByteSize);
	else if (sc->performPostCompilationOutputOffset)
		sprintf(offsetTerm, "%s", (outputType < 1000) ? "consts.outputOffset + " : "consts.kernelOffset + ");

	// X: the stride multiplication is omitted when there is a single axis upload.
	// NOTE(review): indexInputVkFFT keys this on inputStride[0] == 1 instead of
	// numAxisUploads - confirm the asymmetry is intended.
	char xTerm[500] = "";
	if (sc->numAxisUploads != 1)
		sprintf(xTerm, "(%s) * %" PRIu64 "", index_x, sc->outputStride[0]);
	else
		sprintf(xTerm, "(%s)", index_x);

	// Y: taken from index_y in the grouped scheme, from the workgroup id otherwise.
	char yTerm[500] = "";
	if (groupedLayout) {
		if (index_y)
			sprintf(yTerm, " + (%s) * %" PRIu64 "", index_y, sc->outputStride[1]);
	}
	else if (sc->size[1] > 1) {
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		uint64_t yStride = sc->outputStride[1];
		if (sc->numAxisUploads == 1)
			yStride = mult * sc->localSize[sc->axisSwapped ? 0 : 1] * sc->outputStride[1];
		if (sc->performWorkGroupShift[1])
			sprintf(yTerm, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, yStride);
		else
			sprintf(yTerm, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, yStride);
	}

	// Z: reduced modulo dispatchZactualFFTSize when the z id also encodes coordinates/batches.
	char zTerm[500] = "";
	if (sc->size[2] > 1) {
		int zIsShared = (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1);
		if (zIsShared && sc->performWorkGroupShift[2])
			sprintf(zTerm, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->outputStride[2]);
		else if (zIsShared)
			sprintf(zTerm, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->outputStride[2]);
		else if (sc->performWorkGroupShift[2])
			sprintf(zTerm, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->outputStride[2]);
		else
			sprintf(zTerm, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->outputStride[2]);
	}

	// Coordinate: derived from the z id, or from the caller's expression during a
	// matrix convolution step (which also resets maxCoordinate for the batch term).
	char coordinateTerm[500] = "";
	uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution;
	if (sc->numCoordinates * sc->matrixConvolution > 1)
		sprintf(coordinateTerm, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->outputStride[3]);
	if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) {
		maxCoordinate = 1;
		sprintf(coordinateTerm, " + %s * %" PRIu64 "", coordinate, sc->outputStride[3]);
	}

	// Batch: the caller's batchID during a multi-kernel convolution step, else derived from the z id.
	char batchTerm[500] = "";
	if ((sc->numBatches > 1) || (sc->numKernels > 1)) {
		if (sc->convolutionStep && (sc->numKernels > 1))
			sprintf(batchTerm, " + %s * %" PRIu64 "", batchID, sc->outputStride[4]);
		else
			sprintf(batchTerm, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->outputStride[4]);
	}

	sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", offsetTerm, xTerm, yTerm, zTerm, coordinateTerm, batchTerm);
	return VkAppendLine(sc);
}

static inline VkFFTResult inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t radix, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, char** regID) {
	VkFFTResult res = VKFFT_SUCCESS;
	long double double_PI = 3.14159265358979323846264338327950288419716939937510L;
	char vecType[30];
	char LFending[4] = "";
	if (!strcmp(floatType, "float")) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
	char cosDef[20] = "cos";
	char sinDef[20] = "sin";
	if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
#elif(VKFFT_BACKEND==1)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	char cosDef[20] = "__cosf";
	char sinDef[20] = "__sinf";
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#elif(VKFFT_BACKEND==2)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	char cosDef[20] = "__cosf";
	char sinDef[20] = "__sinf";
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	char cosDef[20] = "native_cos";
	char sinDef[20] = "native_sin";
	//if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#endif
	char* temp = sc->temp;
	//sprintf(temp, "loc_0");
	char* w = sc->w;
	//sprintf(w, "w");
	char* iw = sc->iw;
	//sprintf(iw, "iw");
	char convolutionInverse[30] = "";
	if (sc->convolutionStep) sprintf(convolutionInverse, ", %s inverse", uintType);
	switch (radix) {
	case 2: {
		/*if (sc->LUT) {
			sc->tempLen = sprintf(sc->tempStr, "void radix2(inout %s temp_0, inout %s temp_1, %s LUTId) {\n", vecType, vecType, uintType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "void radix2(inout %s temp_0, inout %s temp_1, %s angle) {\n", vecType, vecType, floatType);
		}*/
		/*VkAppendLine(sc, "	{\n");
		sc->tempLen = sprintf(sc->tempStr, "	%s %s;\n", vecType, temp);
		res = VkAppendLine(sc);
if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	{\n\
	%s temp;\n", vecType);*/
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle);\n", w, cosDef);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle);\n", w, sinDef);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle);\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		res = VkMulComplex(sc, temp, regID[1], w, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[1], regID[0], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[0], regID[0], temp);
		if (res != VKFFT_SUCCESS) return res;
		/*VkAppendLine(sc, "	}\n");
		sc->tempLen = sprintf(sc->tempStr, "\
temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\
temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\
temp%s = temp%s - temp;\n\
temp%s = temp%s + temp;\n\
}\n", regID[1], regID[1], regID[1], regID[1], regID[1], regID[0], regID[0], regID[0]);*/
		break;
	}
	case 3: {
		/*	if (sc->LUT) {
				sc->tempLen = sprintf(sc->tempStr, "void radix3(inout %s temp_0, inout %s temp_1, inout %s temp_2, %s LUTId) {\n", vecType, vecType, vecType, uintType);
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "void radix3(inout %s temp_0, inout %s temp_1, inout %s temp_2, %s angle) {\n", vecType, vecType, vecType, floatType);
			}*/
		char* tf[2];
		//VkAppendLine(sc, "	{\n");
		for (uint64_t i = 0; i < 2; i++) {
			tf[i] = (char*)malloc(sizeof(char) * 50);
			if (!tf[i]) {
				for (uint64_t j = 0; j < i; j++) {
					free(tf[j]);
					tf[j] = 0;
				}
				return VKFFT_ERROR_MALLOC_FAILED;
			}
		}

		sprintf(tf[0], "-0.5%s", LFending);
		sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending);

		/*for (uint64_t i = 0; i < 3; i++) {
			sc->locID[i] = (char*)malloc(sizeof(char) * 50);
			sprintf(sc->locID[i], "loc_%" PRIu64 "", i);
			sc->tempLen = sprintf(sc->tempStr, "	%s %s;\n", vecType, sc->locID[i]);
			res = VkAppendLine(sc);
if (res != VKFFT_SUCCESS) return res;
			}*/
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 4.0 / 3.0, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 4.0 / 3.0, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 4.0 / 3.0, 4.0 / 3.0);
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 4.0 / 3.0, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		res = VkMulComplex(sc, sc->locID[2], regID[2], w, 0);
		/*sc->tempLen = sprintf(sc->tempStr, "\
loc_2.x = temp%s.x * w.x - temp%s.y * w.y;\n\
loc_2.y = temp%s.y * w.x + temp%s.x * w.y;\n", regID[2], regID[2], regID[2], regID[2]);*/
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n", w, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n", w, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 / 3.0, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 / 3.0, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 / 3.0, 2.0 / 3.0);
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s=sincos_20(angle*%.17e%s);\n", w, 2.0 / 3.0, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		res = VkMulComplex(sc, sc->locID[1], regID[1], w, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[1], sc->locID[1], sc->locID[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[2], sc->locID[1], sc->locID[2]);
		if (res != VKFFT_SUCCESS) return res;
		/*sc->tempLen = sprintf(sc->tempStr, "\
temp%s = loc_1 + loc_2;\n\
temp%s = loc_1 - loc_2;\n", regID[1], regID[2]);*/
		res = VkAddComplex(sc, sc->locID[0], regID[0], regID[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkFMAComplex(sc, sc->locID[1], regID[1], tf[0], regID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, sc->locID[2], regID[2], tf[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[0], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;
		/*sc->tempLen = sprintf(sc->tempStr, "\
loc_0 = temp%s + temp%s;\n\
loc_1 = temp%s - 0.5 * temp%s;\n\
loc_2 = -0.8660254037844386467637231707529 * temp%s;\n\
temp%s = loc_0;\n", regID[0], regID[1], regID[0], regID[1], regID[2], regID[0]);*/

		if (stageAngle < 0)
		{
			res = VkShuffleComplex(sc, regID[1], sc->locID[1], sc->locID[2], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplexInv(sc, regID[2], sc->locID[1], sc->locID[2], 0);
			if (res != VKFFT_SUCCESS) return res;
			/*sc->tempLen = sprintf(sc->tempStr, "\
temp%s.x = loc_1.x - loc_2.y; \n\
temp%s.y = loc_1.y + loc_2.x; \n\
temp%s.x = loc_1.x + loc_2.y; \n\
temp%s.y = loc_1.y - loc_2.x; \n", regID[1], regID[1], regID[2], regID[2]);*/
		}
		else {
			res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[2], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplex(sc, regID[2], sc->locID[1], sc->locID[2], 0);
			if (res != VKFFT_SUCCESS) return res;
			/*sc->tempLen = sprintf(sc->tempStr, "\
temp%s.x = loc_1.x + loc_2.y; \n\
temp%s.y = loc_1.y - loc_2.x; \n\
temp%s.x = loc_1.x - loc_2.y; \n\
temp%s.y = loc_1.y + loc_2.x; \n", regID[1], regID[1], regID[2], regID[2]);*/
		}

		//VkAppendLine(sc, "	}\n");
		for (uint64_t i = 0; i < 2; i++) {
			free(tf[i]);
			tf[i] = 0;
			//free(sc->locID[i]);
		}
		//free(sc->locID[2]);
		break;
	}
	case 4: {
		// Radix-4 butterfly emitter. The butterfly is generated as two layers of radix-2
		// butterflies over regID[0..3], followed by a register permutation. Each call below
		// appends text to the kernel source being built in sc; the first failing call's
		// result code is returned to the caller unchanged.
		//
		// (An older variant emitted a standalone "void radix4(...)" function; the code is
		// now generated inline, so no function header or local declarations are written here.)

		// NOTE(review): nothing earlier in this case assigns res (the preceding emitter calls
		// are commented out in the original), so this re-checks the value res carried into
		// this case.
		if (res != VKFFT_SUCCESS) return res;

		// ---- First twiddle factor w ----
		if (stageSize == 1) {
			// First stage: the twiddle is the constant 1 + 0i.
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				// Table-driven twiddle: read either from the sdata array (when
				// useCoalescedLUTUploadToSM is set) or directly from twiddleLUT.
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// For a negative stage angle the table value is conjugated.
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				// No table: evaluate the twiddle from `angle` in the generated code.
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle);\n", w, cosDef);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle);\n", w, sinDef);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle);\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}

		// ---- First layer: two radix-2 butterflies sharing w ----
		// Per the original author's pseudo-code:
		//   temp = regID[2] * w;  regID[2] = regID[0] - temp;  regID[0] = regID[0] + temp;
		//   temp = regID[3] * w;  regID[3] = regID[1] - temp;  regID[1] = regID[1] + temp;
		res = VkMulComplex(sc, temp, regID[2], w, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[2], regID[0], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[0], regID[0], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplex(sc, temp, regID[3], w, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[3], regID[1], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[1], regID[1], temp);
		if (res != VKFFT_SUCCESS) return res;

		// ---- Second twiddle factor (half of the first angle) ----
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				// Same table as above, read at an offset of stageSize entries.
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n", w, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s=twiddleLUT[LUTId+%" PRIu64 "];\n", w, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(0.5%s*angle);\n", w, cosDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(0.5%s*angle);\n", w, sinDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strcmp(floatType, "double")) {
					// Derives the new w from the w still held from the first layer:
					// normalize(w + (1, 0)). This line therefore depends on w not
					// having been overwritten since the first twiddle was emitted.
					sc->tempLen = sprintf(sc->tempStr, "	%s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}

		// ---- Second layer, first butterfly ----
		//   temp = regID[1] * w;  regID[1] = regID[0] - temp;  regID[0] = regID[0] + temp;
		res = VkMulComplex(sc, temp, regID[1], w, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[1], regID[0], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[0], regID[0], temp);
		if (res != VKFFT_SUCCESS) return res;

		// ---- Rotate w by a quarter turn for the second butterfly of the layer ----
		// temp.x is used as scratch for the swap. The first emitted statement has no
		// trailing newline, so it shares a line with the next one in the generated source.
		if (stageAngle < 0) {
			// Emitted effect: w = (w.y, -w.x)
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x;", temp, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", w, temp);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			// Emitted effect: w = (-w.y, w.x)
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x;", temp, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", w, temp);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		// ---- Second layer, second butterfly ----
		//   temp = regID[3] * w;  regID[3] = regID[2] - temp;  regID[2] = regID[2] + temp;
		res = VkMulComplex(sc, temp, regID[3], w, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[3], regID[2], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[2], regID[2], temp);
		if (res != VKFFT_SUCCESS) return res;

		// ---- Output reordering ----
		// NOTE(review): the commented-out predecessor of this call swapped regID[1] and
		// regID[2] through temp; the table {0,2,1,3} presumably requests the same swap
		// from VkPermute - confirm against VkPermute's definition.
		uint64_t permute2[4] = { 0,2,1,3 };
		res = VkPermute(sc, permute2, 4, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;

		break;
	}
	case 5: {
		// Radix-5 butterfly emitter. Each call below appends text to the kernel source
		// being built in sc; the first failing call's result code is returned unchanged.
		//
		// Steps (pseudo-code, regID[k] written as r_k and sc->locID[k] as loc_k):
		//   1. loc_i = r_i * w_i                      for i = 4..1 (twiddle multiplication)
		//   2. sums/differences of the twiddled inputs
		//   3. scaling by the real constants tf[0..4]
		//   4. recombination and a final sign-dependent shuffle into r_1..r_4
		//
		// The real constants are formatted into fixed-size automatic buffers rather than
		// heap allocations: this case has many early `return res` exits, and automatic
		// storage needs no cleanup on any of them.
		char tf[5][50];
		sprintf(tf[0], "-0.5%s", LFending);
		sprintf(tf[1], "1.538841768587626701285145288018455%s", LFending);
		sprintf(tf[2], "-0.363271264002680442947733378740309%s", LFending);
		sprintf(tf[3], "-0.809016994374947424102293417182819%s", LFending);
		sprintf(tf[4], "-0.587785252292473129168705954639073%s", LFending);

		// ---- Step 1: twiddle multiplication, highest input first ----
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				// First stage: every twiddle is the constant 1 + 0i.
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				if (i == radix - 1) {
					// First twiddle of the stage: table entry without an offset.
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						// For a negative stage angle the table value is conjugated.
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						// No table: evaluate the twiddle from `angle` scaled by 2*i/radix.
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				else {
					// Remaining twiddles: table entry at offset (radix-1-i)*stageSize.
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			// loc_i = r_i * w
			res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
			if (res != VKFFT_SUCCESS) return res;
		}

		// ---- Step 2: sums and differences ----
		//   r_1 = loc_1 + loc_4;  r_2 = loc_2 + loc_3;
		//   r_3 = loc_2 - loc_3;  r_4 = loc_1 - loc_4;
		//   loc_3 = r_1 - r_2;    loc_4 = r_3 + r_4;
		res = VkAddComplex(sc, regID[1], sc->locID[1], sc->locID[4]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[2], sc->locID[2], sc->locID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[3], sc->locID[2], sc->locID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[4], sc->locID[1], sc->locID[4]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, sc->locID[3], regID[1], regID[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[4], regID[3], regID[4]);
		if (res != VKFFT_SUCCESS) return res;

		// ---- Step 3: scaling by the real constants ----
		//   loc_0 = r_0 + r_1 + r_2;
		//   loc_1 = r_0 + tf[0] * r_1;   loc_2 = r_0 + tf[0] * r_2;
		//   r_3 *= tf[1];  r_4 *= tf[2];  loc_3 *= tf[3];  loc_4 *= tf[4];
		res = VkAddComplex(sc, sc->locID[0], regID[0], regID[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[0], sc->locID[0], regID[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkFMAComplex(sc, sc->locID[1], regID[1], tf[0], regID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkFMAComplex(sc, sc->locID[2], regID[2], tf[0], regID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, regID[3], regID[3], tf[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, regID[4], regID[4], tf[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[4]);
		if (res != VKFFT_SUCCESS) return res;

		// ---- Step 4a: recombination ----
		//   loc_1 -= loc_3;  loc_2 += loc_3;
		//   loc_3 = r_3 + loc_4;  loc_4 += r_4;
		//   r_0 = loc_0;
		res = VkSubComplex(sc, sc->locID[1], sc->locID[1], sc->locID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[2], sc->locID[2], sc->locID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[3], regID[3], sc->locID[4]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[4], sc->locID[4], regID[4]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[0], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;

		// ---- Step 4b: final shuffle; the two variants are swapped by the sign of stageAngle ----
		// Per the original author's pseudo-code:
		//   VkShuffleComplex(out, a, b):    out.x = a.x - b.y;  out.y = a.y + b.x;
		//   VkShuffleComplexInv(out, a, b): out.x = a.x + b.y;  out.y = a.y - b.x;
		if (stageAngle < 0)
		{
			res = VkShuffleComplex(sc, regID[1], sc->locID[1], sc->locID[4], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplex(sc, regID[2], sc->locID[2], sc->locID[3], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplexInv(sc, regID[3], sc->locID[2], sc->locID[3], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplexInv(sc, regID[4], sc->locID[1], sc->locID[4], 0);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[4], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplexInv(sc, regID[2], sc->locID[2], sc->locID[3], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplex(sc, regID[3], sc->locID[2], sc->locID[3], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplex(sc, regID[4], sc->locID[1], sc->locID[4], 0);
			if (res != VKFFT_SUCCESS) return res;
		}

		break;
	}
	case 6: {
		// Radix-6 butterfly emitter, built as Q = 2 radix-3 butterflies followed by three
		// radix-2 butterflies and a register permutation. Each call below appends text to
		// the kernel source being built in sc; the first failing call's result code is
		// returned unchanged.
		//
		// The two real constants are formatted into fixed-size automatic buffers rather
		// than heap allocations: this case has many early `return res` exits, and automatic
		// storage needs no cleanup on any of them.
		char tf[2][50];
		sprintf(tf[0], "-0.5%s", LFending);
		sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending);

		// ---- Twiddle multiplication, highest input first: regID[i] = regID[i] * w ----
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				// First stage: every twiddle is the constant 1 + 0i.
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				if (i == radix - 1) {
					// First twiddle of the stage: table entry without an offset.
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						// For a negative stage angle the table value is conjugated.
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						// No table: evaluate the twiddle from `angle` scaled by 2*i/radix.
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				else {
					// Remaining twiddles: table entry at offset (radix-1-i)*stageSize.
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			// In-place multiply; temp is passed as the last argument, presumably as
			// scratch so the product can overwrite its own input - confirm in VkMulComplex.
			res = VkMulComplex(sc, regID[i], regID[i], w, temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		// ---- Q radix-3 butterflies over (regID[i], regID[i+Q], regID[i+2Q]) ----
		uint64_t Q = 2;
		for (uint64_t i = 0; i < Q; i++) {
			// Stage the three inputs in loc_0..loc_2.
			res = VkMovComplex(sc, sc->locID[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[1], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;

			// Sum and difference of the two upper inputs.
			res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2]);
			if (res != VKFFT_SUCCESS) return res;

			// loc_0 = r_i + sum;  loc_1 = r_i + tf[0] * sum;  loc_2 = tf[1] * diff;
			res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[2], regID[i + 2 * Q], tf[1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, regID[i], sc->locID[0]);
			if (res != VKFFT_SUCCESS) return res;

			// Final shuffle; the two variants are swapped by the sign of stageAngle.
			if (stageAngle < 0)
			{
				res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		// ---- Three radix-2 butterflies on the pairs (0,1), (2,3), (4,5) ----
		// Pair (0,1): no twiddle.
		res = VkMovComplex(sc, temp, regID[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[1], regID[0], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[0], regID[0], temp);
		if (res != VKFFT_SUCCESS) return res;

		// Pair (2,3): w = (-0.5, +/-0.866...), with the sign of w.y chosen by stageAngle.
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -0.5%s;\n", w, LFending);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0.8660254037844386467637231707529%s;\n\n", w, LFending);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -0.5%s;\n", w, LFending);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -0.8660254037844386467637231707529%s;\n\n", w, LFending);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		res = VkMulComplex(sc, temp, regID[3], w, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[3], regID[2], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[2], regID[2], temp);
		if (res != VKFFT_SUCCESS) return res;

		// Pair (4,5): same w with its imaginary part negated.
		sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMulComplex(sc, temp, regID[5], w, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[5], regID[4], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[4], regID[4], temp);
		if (res != VKFFT_SUCCESS) return res;

		// ---- Output reordering ----
		// NOTE(review): the commented-out predecessor of this call swapped regID[1] with
		// regID[3] and regID[2] with regID[4] through temp; the exact meaning of the table
		// should be confirmed against VkPermute's definition.
		uint64_t permute2[6] = { 0,3,4,1,2,5 };
		res = VkPermute(sc, permute2, 6, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;

		break;
	}
	case 7: {
		/*if (sc->LUT) {
			sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s LUTId) {\n", vecType, vecType, vecType, vecType, vecType, uintType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s angle) {\n", vecType, vecType, vecType, vecType, vecType, floatType);
		}*/
		char* tf[8];

		//VkAppendLine(sc, "	{\n");
		for (uint64_t i = 0; i < 8; i++) {
			tf[i] = (char*)malloc(sizeof(char) * 50);
			if (!tf[i]) {
				for (uint64_t j = 0; j < i; j++) {
					free(tf[j]);
					tf[j] = 0;
				}
				return VKFFT_ERROR_MALLOC_FAILED;
			}
		}
		sprintf(tf[0], "-1.16666666666666651863693004997913%s", LFending);
		sprintf(tf[1], "0.79015646852540022404554065360571%s", LFending);
		sprintf(tf[2], "0.05585426728964774240049351305970%s", LFending);
		sprintf(tf[3], "0.73430220123575240531721419756650%s", LFending);
		if (stageAngle < 0) {
			sprintf(tf[4], "0.44095855184409837868031445395900%s", LFending);
			sprintf(tf[5], "0.34087293062393136944265847887436%s", LFending);
			sprintf(tf[6], "-0.53396936033772524066165487965918%s", LFending);
			sprintf(tf[7], "0.87484229096165666561546458979137%s", LFending);
		}
		else {
			sprintf(tf[4], "-0.44095855184409837868031445395900%s", LFending);
			sprintf(tf[5], "-0.34087293062393136944265847887436%s", LFending);
			sprintf(tf[6], "0.53396936033772524066165487965918%s", LFending);
			sprintf(tf[7], "-0.87484229096165666561546458979137%s", LFending);
		}
		/*for (uint64_t i = 0; i < 7; i++) {
			sc->locID[i] = (char*)malloc(sizeof(char) * 50);
			sprintf(sc->locID[i], "loc_%" PRIu64 "", i);
			sc->tempLen = sprintf(sc->tempStr, "	%s %s;\n", vecType, sc->locID[i]);
			res = VkAppendLine(sc);
if (res != VKFFT_SUCCESS) return res;
			}*/
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				if (i == radix - 1) {
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				else {
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			/*sc->tempLen = sprintf(sc->tempStr, "\
loc_%" PRIu64 ".x = temp%s.x * w.x - temp%s.y * w.y;\n\
loc_%" PRIu64 ".y = temp%s.y * w.x + temp%s.x * w.y;\n", i, regID[i], regID[i], i, regID[i], regID[i]);*/
		}
		res = VkMovComplex(sc, sc->locID[0], regID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[0], sc->locID[1], sc->locID[6]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[1], sc->locID[1], sc->locID[6]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[2], sc->locID[2], sc->locID[5]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[3], sc->locID[2], sc->locID[5]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[4], sc->locID[4], sc->locID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[5], sc->locID[4], sc->locID[3]);
		if (res != VKFFT_SUCCESS) return res;
		/*sc->tempLen = sprintf(sc->tempStr, "\
loc_0 = temp%s;\n\
temp%s = loc_1 + loc_6;\n\
temp%s = loc_1 - loc_6;\n\
temp%s = loc_2 + loc_5;\n\
temp%s = loc_2 - loc_5;\n\
temp%s = loc_4 + loc_3;\n\
temp%s = loc_4 - loc_3;\n", regID[0], regID[0], regID[1], regID[2], regID[3], regID[4], regID[5]);*/
		res = VkAddComplex(sc, sc->locID[5], regID[1], regID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[5]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[1], regID[0], regID[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[4]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[0], sc->locID[0], sc->locID[1]);
		if (res != VKFFT_SUCCESS) return res;
		/*sc->tempLen = sprintf(sc->tempStr, "\
loc_5 = temp%s + temp%s + temp%s;\n\
loc_1 = temp%s + temp%s + temp%s;\n\
loc_0 += loc_1;\n", regID[1], regID[3], regID[5], regID[0], regID[2], regID[4]);*/
		res = VkSubComplex(sc, sc->locID[2], regID[0], regID[4]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, sc->locID[3], regID[4], regID[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, sc->locID[4], regID[2], regID[0]);
		if (res != VKFFT_SUCCESS) return res;
		/*sc->tempLen = sprintf(sc->tempStr, "\
loc_2 = temp%s - temp%s;\n\
loc_3 = temp%s - temp%s;\n\
loc_4 = temp%s - temp%s;\n", regID[0], regID[4], regID[4], regID[2], regID[2], regID[0]);*/
		res = VkSubComplex(sc, regID[0], regID[1], regID[5]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[2], regID[5], regID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[4], regID[3], regID[1]);
		if (res != VKFFT_SUCCESS) return res;
		/*sc->tempLen = sprintf(sc->tempStr, "\
temp%s = temp%s - temp%s;\n\
temp%s = temp%s - temp%s;\n\
temp%s = temp%s - temp%s;\n", regID[0], regID[1], regID[5], regID[2], regID[5], regID[3], regID[4], regID[3], regID[1]);*/

		res = VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, sc->locID[2], sc->locID[2], tf[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, sc->locID[5], sc->locID[5], tf[4]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, regID[0], regID[0], tf[5]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, regID[2], regID[2], tf[6]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, regID[4], regID[4], tf[7]);
		if (res != VKFFT_SUCCESS) return res;
		/*sc->tempLen = sprintf(sc->tempStr, "\
loc_1 *= -1.16666666666666651863693004997913;\n\
loc_2 *= 0.79015646852540022404554065360571;\n\
loc_3 *= 0.05585426728964774240049351305970;\n\
loc_4 *= 0.73430220123575240531721419756650;\n\
loc_5 *= 0.44095855184409837868031445395900;\n\
temp%s *= 0.34087293062393136944265847887436;\n\
temp%s *= -0.53396936033772524066165487965918;\n\
temp%s *= 0.87484229096165666561546458979137;\n", regID[0], regID[2], regID[4]);*/

		res = VkSubComplex(sc, regID[5], regID[4], regID[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplexInv(sc, regID[6], regID[4], regID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[4], regID[0], regID[2]);
		if (res != VKFFT_SUCCESS) return res;
		/*sc->tempLen = sprintf(sc->tempStr, "\
temp%s = temp%s - temp%s;\n\
temp%s = - temp%s - temp%s;\n\
temp%s = temp%s + temp%s;\n", regID[5], regID[4], regID[2], regID[6], regID[4], regID[0], regID[4], regID[0], regID[2]);*/
		res = VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[1], sc->locID[2], sc->locID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[2], sc->locID[4], sc->locID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplexInv(sc, regID[3], sc->locID[2], sc->locID[4]);
		if (res != VKFFT_SUCCESS) return res;
		/*sc->tempLen = sprintf(sc->tempStr, "\
temp%s = loc_0 + loc_1;\n\
temp%s = loc_2 + loc_3;\n\
temp%s = loc_4 - loc_3;\n\
temp%s = - loc_2 - loc_4;\n", regID[0], regID[1], regID[2], regID[3]);*/
		res = VkAddComplex(sc, sc->locID[1], regID[0], regID[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[2], regID[0], regID[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[3], regID[0], regID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[4], regID[4], sc->locID[5]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[6], regID[6], sc->locID[5]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[5]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[0], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;
		/*sc->tempLen = sprintf(sc->tempStr, "\
loc_1 = temp%s + temp%s;\n\
loc_2 = temp%s + temp%s;\n\
loc_3 = temp%s + temp%s;\n\
loc_4 = temp%s + loc_5;\n\
loc_6 = temp%s + loc_5;\n\
loc_5 += temp%s;\n\
temp%s = loc_0;\n", regID[0], regID[1], regID[0], regID[2], regID[0], regID[3], regID[4], regID[6], regID[5], regID[0]);*/
		res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[4], 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkShuffleComplexInv(sc, regID[2], sc->locID[3], sc->locID[6], 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkShuffleComplex(sc, regID[3], sc->locID[2], sc->locID[5], 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkShuffleComplexInv(sc, regID[4], sc->locID[2], sc->locID[5], 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkShuffleComplex(sc, regID[5], sc->locID[3], sc->locID[6], 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkShuffleComplex(sc, regID[6], sc->locID[1], sc->locID[4], 0);
		if (res != VKFFT_SUCCESS) return res;

		/*sc->tempLen = sprintf(sc->tempStr, "\
temp%s.x = loc_1.x + loc_4.y; \n\
temp%s.y = loc_1.y - loc_4.x; \n\
temp%s.x = loc_3.x + loc_6.y; \n\
temp%s.y = loc_3.y - loc_6.x; \n\
temp%s.x = loc_2.x - loc_5.y; \n\
temp%s.y = loc_2.y + loc_5.x; \n\
temp%s.x = loc_2.x + loc_5.y; \n\
temp%s.y = loc_2.y - loc_5.x; \n\
temp%s.x = loc_3.x - loc_6.y; \n\
temp%s.y = loc_3.y + loc_6.x; \n\
temp%s.x = loc_1.x - loc_4.y; \n\
temp%s.y = loc_1.y + loc_4.x; \n", regID[1], regID[1], regID[2], regID[2], regID[3], regID[3], regID[4], regID[4], regID[5], regID[5], regID[6], regID[6]);
		VkAppendLine(sc, "	}\n");*/
		/*for (uint64_t i = 0; i < 7; i++) {
			free(sc->locID[i]);
		}*/
		for (uint64_t i = 0; i < 8; i++) {
			free(tf[i]);
			tf[i] = 0;
		}
		break;
	}
	case 8: {
		// Radix-8 kernel: emits three radix-2 passes over regID[0..7] and then a
		// register permutation. Every pass follows the same pattern:
		//   1. emit code that loads the twiddle into `w` (or derives `iw` from it),
		//   2. emit  temp = hi * twiddle;  hi = lo - temp;  lo = lo + temp;
		// The Vk* helpers append shader text to `sc`; any failure is propagated
		// to the caller immediately.

		// ---- Pass 1 twiddle -------------------------------------------------
		// stageSize == 1 uses the constant (1, 0). Otherwise the twiddle comes
		// from the LUT (conjugated when stageAngle < 0) or is computed from `angle`.
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				// The original nested a "double" check inside this "float" branch;
				// it could never be true and has been removed. The reachable
				// "double" handling is the sibling branch right below.
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle);\n", w, cosDef);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle);\n", w, sinDef);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle);\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		// Pass 1 butterflies: pairs (regID[i], regID[i + 4]) for i = 0..3.
		for (uint64_t i = 0; i < 4; i++) {
			res = VkMulComplex(sc, temp, regID[i + 4], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		// ---- Pass 2 twiddle -------------------------------------------------
		// LUT entry at offset stageSize, or the half angle when computed.
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(0.5%s*angle);\n", w, cosDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(0.5%s*angle);\n", w, sinDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strcmp(floatType, "double")) {
					// Double path derives the new twiddle from the previous `w`
					// (normalize(w + 1)) instead of re-evaluating sin/cos.
					sc->tempLen = sprintf(sc->tempStr, "	%s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		// Pass 2 butterflies on the lower half: pairs (regID[i], regID[i + 2]), i = 0..1.
		for (uint64_t i = 0; i < 2; i++) {
			res = VkMulComplex(sc, temp, regID[i + 2], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		// iw = w rotated by a quarter turn: (w.y, -w.x) when stageAngle < 0,
		// (-w.y, w.x) otherwise.
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		// Pass 2 butterflies on the upper half: pairs (regID[i], regID[i + 2]), i = 4..5, using iw.
		for (uint64_t i = 4; i < 6; i++) {
			res = VkMulComplex(sc, temp, regID[i + 2], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		// ---- Pass 3 twiddle -------------------------------------------------
		// LUT entry at offset 2 * stageSize, or the quarter angle when computed.
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 2 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 2 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(0.25%s*angle);\n", w, cosDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(0.25%s*angle);\n", w, sinDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		// Pass 3, pair (regID[0], regID[1]) with w.
		res = VkMulComplex(sc, temp, regID[1], w, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[1], regID[0], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[0], regID[0], temp);
		if (res != VKFFT_SUCCESS) return res;

		// iw = w rotated by a quarter turn (direction depends on stageAngle sign).
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Pass 3, pair (regID[2], regID[3]) with iw.
		res = VkMulComplex(sc, temp, regID[3], iw, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[3], regID[2], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[2], regID[2], temp);
		if (res != VKFFT_SUCCESS) return res;

		// iw = w * loc_SQRT1_2 * (1 -/+ i), i.e. w rotated by an eighth turn.
		// loc_SQRT1_2 is a symbol of the generated shader (declared outside this case).
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Pass 3, pair (regID[4], regID[5]) with iw.
		res = VkMulComplex(sc, temp, regID[5], iw, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[5], regID[4], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[4], regID[4], temp);
		if (res != VKFFT_SUCCESS) return res;

		// w = iw rotated by a further quarter turn (note: w is overwritten here).
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Pass 3, pair (regID[6], regID[7]) with w.
		res = VkMulComplex(sc, temp, regID[7], w, 0);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubComplex(sc, regID[7], regID[6], temp);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[6], regID[6], temp);
		if (res != VKFFT_SUCCESS) return res;

		// Reorder the eight registers with the index table below
		// (NOTE(review): looks like a 3-bit bit-reversal with 3<->6 and 1<->4 swapped - confirm against VkPermute).
		uint64_t permute2[8] = { 0,4,2,6,1,5,3,7 };
		res = VkPermute(sc, permute2, 8, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;

		break;
	}
	case 9: {
		// Radix-9 kernel, emitted as a 3 x 3 decomposition:
		//   1. apply the stage twiddle to regID[1..8],
		//   2. Q = 3 radix-3 butterflies over (regID[i], regID[i+Q], regID[i+2Q]),
		//   3. P = 3 radix-3 butterflies over (regID[Q*i], regID[Q*i+1], regID[Q*i+2]),
		//      with constant inter-stage twiddles for i > 0,
		//   4. a final register permutation.
		// The Vk* helpers append shader text to `sc`; any failure is propagated
		// to the caller immediately.
		//
		// Radix-3 constants as shader literals. They live in automatic storage
		// (same 50-byte capacity as the former heap buffers) so that none of the
		// early `return res;` paths below can leak them.
		char tf[2][50];

		sprintf(tf[0], "-0.5%s", LFending);
		sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending);

		// Step 1: load the twiddle for each input i = radix-1 .. 1 and multiply
		// regID[i] by it in place.
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				if (i == radix - 1) {
					// First iteration reads the LUT without an offset.
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				else {
					// Later iterations read the LUT at offset (radix - 1 - i) * stageSize.
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			res = VkMulComplex(sc, regID[i], regID[i], w, temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		uint64_t P = 3;
		uint64_t Q = 3;

		// Step 2: radix-3 butterflies over registers strided by Q.
		for (uint64_t i = 0; i < Q; i++) {
			res = VkMovComplex(sc, sc->locID[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[1], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			// presumably locID[1] = regID[i + Q] * tf[0] + regID[i] - confirm against VkFMAComplex
			res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[2], regID[i + 2 * Q], tf[1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, regID[i], sc->locID[0]);
			if (res != VKFFT_SUCCESS) return res;
			// The two shuffle variants are swapped depending on the sign of stageAngle.
			if (stageAngle < 0)
			{
				res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		// Step 3: radix-3 butterflies over contiguous triples. For i > 0 the
		// second and third inputs are first multiplied by constant twiddles with
		// angles 2*i*pi/radix and 4*i*pi/radix (imaginary part negated when
		// stageAngle < 0); for i == 0 they are copied unchanged.
		for (uint64_t i = 0; i < P; i++) {
			if (i > 0) {
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)-sin(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)sin(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				res = VkMulComplex(sc, sc->locID[1], regID[Q * i + 1], w, temp);
				if (res != VKFFT_SUCCESS) return res;
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(4 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)-sin(4 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(4 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)sin(4 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				res = VkMulComplex(sc, sc->locID[2], regID[Q * i + 2], w, temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkMovComplex(sc, sc->locID[1], regID[1]);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovComplex(sc, sc->locID[2], regID[2]);
				if (res != VKFFT_SUCCESS) return res;
			}

			res = VkAddComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkAddComplex(sc, sc->locID[0], regID[Q * i], regID[Q * i + 1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkFMAComplex(sc, sc->locID[1], regID[Q * i + 1], tf[0], regID[Q * i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[2], regID[Q * i + 2], tf[1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, regID[Q * i], sc->locID[0]);
			if (res != VKFFT_SUCCESS) return res;
			if (stageAngle < 0)
			{
				res = VkShuffleComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkShuffleComplexInv(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		// Step 4: reorder the nine registers with the index table below
		// (a 3 x 3 transpose of the register indices).
		uint64_t permute2[9] = { 0,3,6,1,4,7,2,5,8 };
		res = VkPermute(sc, permute2, 9, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;

		break;
	}
	case 10: {
		// Radix-10 butterfly generator. Steps emitted, in order:
		//   1. multiply regID[1..radix-1] by the stage twiddle `w`;
		//   2. for each i < Q (Q = 2): a five-input add/sub/FMA sequence over
		//      regID[i + k*Q], k = 0..4, using sc->locID[0..4] as scratch;
		//   3. for each i < P (P = 5): a two-input add/sub over the adjacent pair
		//      regID[Q*i], regID[Q*i + 1], with a constant twiddle for i > 0;
		//   4. a final VkPermute of the ten registers.
		//
		// The numeric constants are formatted into fixed-size stack buffers, so
		// there is nothing to release on any of the early `return res` paths.
		// The longest literal is 36 characters; the 50-byte buffers leave room
		// for the LFending suffix (assumed to be a short type suffix - TODO
		// confirm against where LFending is set).
		char tf[5][50];
		sprintf(tf[0], "-0.5%s", LFending);
		sprintf(tf[1], "1.538841768587626701285145288018455%s", LFending);
		sprintf(tf[2], "-0.363271264002680442947733378740309%s", LFending);
		sprintf(tf[3], "-0.809016994374947424102293417182819%s", LFending);
		sprintf(tf[4], "-0.587785252292473129168705954639073%s", LFending);

		// Step 1: load/compute the twiddle for each input (highest index first)
		// and apply it in place.
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				// First stage: twiddle is the constant (1, 0).
				sc->tempLen = sprintf(sc->tempStr, "\t%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\t%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else if (sc->LUT) {
				// Twiddle comes from a lookup table; the first one read
				// (i == radix - 1) uses no offset, the others are offset by
				// (radix - 1 - i) * stageSize entries.
				if (i == radix - 1) {
					if (sc->useCoalescedLUTUploadToSM) {
						sc->tempLen = sprintf(sc->tempStr, "\t%s = sdata[stageInvocationID];\n", w);
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "\t%s = twiddleLUT[LUTId];\n", w);
					}
				}
				else {
					if (sc->useCoalescedLUTUploadToSM) {
						sc->tempLen = sprintf(sc->tempStr, "\t%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "\t%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
					}
				}
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (stageAngle < 0) {
					// Negative stage angle: negate the imaginary part of the table value.
					sc->tempLen = sprintf(sc->tempStr, "\t%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				// No table: the generated code evaluates the twiddle from `angle`.
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "\t%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\t%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "\t%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			res = VkMulComplex(sc, regID[i], regID[i], w, temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		uint64_t P = 5;
		uint64_t Q = 2;

		// Step 2: five-input stage over registers i, i+Q, ..., i+4Q.
		for (uint64_t i = 0; i < Q; i++) {
			// Stage the five inputs in sc->locID[0..4].
			res = VkMovComplex(sc, sc->locID[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[1], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[3], regID[i + 3 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[4], regID[i + 4 * Q]);
			if (res != VKFFT_SUCCESS) return res;

			// Pairwise sums and differences of inputs (1,4) and (2,3).
			res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, sc->locID[3], regID[i + Q], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[4], regID[i + 3 * Q], regID[i + 4 * Q]);
			if (res != VKFFT_SUCCESS) return res;

			// Accumulate the first output in locID[0] and scale the
			// intermediate terms by the constants tf[0..4].
			res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[0], sc->locID[0], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkFMAComplex(sc, sc->locID[2], regID[i + 2 * Q], tf[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, regID[i + 3 * Q], regID[i + 3 * Q], tf[1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, regID[i + 4 * Q], regID[i + 4 * Q], tf[2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[4]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkSubComplex(sc, sc->locID[1], sc->locID[1], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[2], sc->locID[2], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[3], regID[i + 3 * Q], sc->locID[4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[4], sc->locID[4], regID[i + 4 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, regID[i], sc->locID[0]);
			if (res != VKFFT_SUCCESS) return res;

			// Final combination; which of the Shuffle / ShuffleInv helpers is
			// used for each output depends on the sign of stageAngle.
			if (stageAngle < 0) {
				res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		// Step 3: two-input stage over adjacent register pairs. For i > 0 the
		// second element is first multiplied by the constant
		// (cos(2*pi*i/radix), +/-sin(2*pi*i/radix)), the sign of the imaginary
		// part following the sign of stageAngle.
		for (uint64_t i = 0; i < P; i++) {
			if (i > 0) {
				sc->tempLen = sprintf(sc->tempStr, "\t%s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "\t%s.y = %.17e%s;\n\n", w, (double)-sin(2 * i * double_PI / radix), LFending);
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "\t%s.y = %.17e%s;\n\n", w, (double)sin(2 * i * double_PI / radix), LFending);
				}
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMulComplex(sc, temp, regID[Q * i + 1], w, 0);
				// This result must be checked here: the next call overwrites res.
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkMovComplex(sc, temp, regID[Q * i + 1]);
				if (res != VKFFT_SUCCESS) return res;
			}
			res = VkSubComplex(sc, regID[Q * i + 1], regID[Q * i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		// Step 4: reorder the registers (even indices first, then odd).
		uint64_t permute2[10] = { 0, 2, 4, 6, 8, 1, 3, 5, 7, 9 };
		res = VkPermute(sc, permute2, 10, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;
		break;
	}
	case 11: {
		// Radix-11 butterfly generator. Steps emitted, in order:
		//   1. multiply regID[1..radix-1] by the stage twiddle `w`, writing the
		//      products to sc->locID[1..10], and copy regID[0] to sc->locID[0];
		//   2. VkPermute with the `permute` table;
		//   3. per-component sums/differences of the pairs locID[i+1], locID[i+6];
		//   4. accumulation of regID[0] and a 5x5 sequence of
		//      VkFMA3Complex_const_w calls using the constants tf_x / tf_y;
		//   5. recombination into regID[1..10] and a final VkPermute with
		//      `permute2`.
		//
		// The constants are formatted into fixed-size stack buffers, so no
		// allocation can fail and nothing has to be released on the early
		// `return res` paths. The longest literal is 38 characters; the 50-byte
		// buffers leave room for the LFending suffix (assumed to be a short type
		// suffix - TODO confirm against where LFending is set).
		char tf_x[10][50];
		char tf_y[10][50];
		sprintf(tf_x[0], "8.4125353283118116886306336876800e-01%s", LFending);
		sprintf(tf_x[1], "-9.5949297361449738990105129410324e-01%s", LFending);
		sprintf(tf_x[2], "-1.4231483827328514046015907335008e-01%s", LFending);
		sprintf(tf_x[3], "-6.5486073394528506407246543075118e-01%s", LFending);
		sprintf(tf_x[4], "4.1541501300188642567903264668505e-01%s", LFending);
		sprintf(tf_x[5], "8.4125353283118116886306336876800e-01%s", LFending);
		sprintf(tf_x[6], "-9.5949297361449738990105129410324e-01%s", LFending);
		sprintf(tf_x[7], "-1.4231483827328514046015907335008e-01%s", LFending);
		sprintf(tf_x[8], "-6.5486073394528506407246543075118e-01%s", LFending);
		sprintf(tf_x[9], "4.1541501300188642567903264668505e-01%s", LFending);
		// The tf_y constants have opposite signs for the two stageAngle signs.
		if (stageAngle < 0) {
			sprintf(tf_y[0], "-5.4064081745559758210122047739077e-01%s", LFending);
			sprintf(tf_y[1], "2.8173255684142969773359373164556e-01%s", LFending);
			sprintf(tf_y[2], "-9.8982144188093273235937163967435e-01%s", LFending);
			sprintf(tf_y[3], "7.5574957435425828375808593451168e-01%s", LFending);
			sprintf(tf_y[4], "9.0963199535451837136413102968824e-01%s", LFending);
			sprintf(tf_y[5], "5.4064081745559758210122047739077e-01%s", LFending);
			sprintf(tf_y[6], "-2.8173255684142969773359373164556e-01%s", LFending);
			sprintf(tf_y[7], "9.8982144188093273235937163967435e-01%s", LFending);
			sprintf(tf_y[8], "-7.5574957435425828375808593451168e-01%s", LFending);
			sprintf(tf_y[9], "-9.0963199535451837136413102968824e-01%s", LFending);
		}
		else {
			sprintf(tf_y[0], "5.4064081745559758210122047739077e-01%s", LFending);
			sprintf(tf_y[1], "-2.8173255684142969773359373164556e-01%s", LFending);
			sprintf(tf_y[2], "9.8982144188093273235937163967435e-01%s", LFending);
			sprintf(tf_y[3], "-7.5574957435425828375808593451168e-01%s", LFending);
			sprintf(tf_y[4], "-9.0963199535451837136413102968824e-01%s", LFending);
			sprintf(tf_y[5], "-5.4064081745559758210122047739077e-01%s", LFending);
			sprintf(tf_y[6], "2.8173255684142969773359373164556e-01%s", LFending);
			sprintf(tf_y[7], "-9.8982144188093273235937163967435e-01%s", LFending);
			sprintf(tf_y[8], "7.5574957435425828375808593451168e-01%s", LFending);
			sprintf(tf_y[9], "9.0963199535451837136413102968824e-01%s", LFending);
		}

		// Step 1: load/compute the twiddle for each input (highest index first)
		// and write the product into sc->locID[i].
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				// First stage: twiddle is the constant (1, 0).
				sc->tempLen = sprintf(sc->tempStr, "\t%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\t%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else if (sc->LUT) {
				// Twiddle comes from a lookup table; the first one read
				// (i == radix - 1) uses no offset, the others are offset by
				// (radix - 1 - i) * stageSize entries.
				if (i == radix - 1) {
					if (sc->useCoalescedLUTUploadToSM) {
						sc->tempLen = sprintf(sc->tempStr, "\t%s = sdata[stageInvocationID];\n", w);
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "\t%s = twiddleLUT[LUTId];\n", w);
					}
				}
				else {
					if (sc->useCoalescedLUTUploadToSM) {
						sc->tempLen = sprintf(sc->tempStr, "\t%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "\t%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
					}
				}
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (stageAngle < 0) {
					// Negative stage angle: negate the imaginary part of the table value.
					sc->tempLen = sprintf(sc->tempStr, "\t%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				// No table: the generated code evaluates the twiddle from `angle`.
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "\t%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\t%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "\t%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
			if (res != VKFFT_SUCCESS) return res;
		}
		res = VkMovComplex(sc, sc->locID[0], regID[0]);
		if (res != VKFFT_SUCCESS) return res;

		// Step 2: reorder the staged values.
		uint64_t permute[11] = { 0,1,2,4,8,5,10,9,7,3,6 };
		res = VkPermute(sc, permute, 11, 0, 0, w);
		if (res != VKFFT_SUCCESS) return res;

		// Step 3: per-component sums and differences of locID[i+1] and locID[i+6].
		for (uint64_t i = 0; i < 5; i++) {
			res = VkSubComplex_x(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex_x(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex_y(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex_y(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]);
			if (res != VKFFT_SUCCESS) return res;
		}

		// Step 4a: accumulate into regID[0].
		for (uint64_t i = 0; i < 5; i++) {
			res = VkAddComplex_x(sc, regID[0], regID[0], regID[i + 1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex_y(sc, regID[0], regID[0], regID[i + 6]);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Step 4b: seed the accumulators - locID[1..5] start from locID[0],
		// locID[6..10] start from zero.
		for (uint64_t i = 1; i < 6; i++) {
			sc->tempLen = sprintf(sc->tempStr, "\t%s=%s;\n", sc->locID[i], sc->locID[0]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 6; i < 11; i++) {
			sc->tempLen = sprintf(sc->tempStr, "\t%s.x=0;\n\t%s.y=0;\n", sc->locID[i], sc->locID[i]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Step 4c: 5x5 multiply-accumulate; the constant index rotates with
		// (j - i) modulo 10.
		for (uint64_t i = 0; i < 5; i++) {
			for (uint64_t j = 0; j < 5; j++) {
				uint64_t id = ((10 - i) + j) % 10;
				res = VkFMA3Complex_const_w(sc, sc->locID[j + 1], sc->locID[j + 6], regID[i + 1], tf_x[id], tf_y[id], regID[i + 6]);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		// Step 5: recombine the accumulators into regID[1..10].
		for (uint64_t i = 1; i < 6; i++) {
			res = VkSubComplex_x(sc, regID[i], sc->locID[i], sc->locID[i + 5]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex_y(sc, regID[i], sc->locID[i], sc->locID[i + 5]);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 1; i < 6; i++) {
			res = VkAddComplex_x(sc, regID[i + 5], sc->locID[i], sc->locID[i + 5]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex_y(sc, regID[i + 5], sc->locID[i], sc->locID[i + 5]);
			if (res != VKFFT_SUCCESS) return res;
		}

		uint64_t permute2[11] = { 0,1,10,3,9,7,2,4,8,5,6 };
		res = VkPermute(sc, permute2, 11, 1, regID, w);
		if (res != VKFFT_SUCCESS) return res;
		break;
	}
	case 12: {
		char* tf[2];
		//VkAppendLine(sc, "	{\n");
		for (uint64_t i = 0; i < 2; i++) {
			tf[i] = (char*)malloc(sizeof(char) * 50);
			if (!tf[i]) {
				for (uint64_t j = 0; j < i; j++) {
					free(tf[j]);
					tf[j] = 0;
				}
				return VKFFT_ERROR_MALLOC_FAILED;
			}
		}
		sprintf(tf[0], "-0.5%s", LFending);
		sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending);
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				if (i == radix - 1) {
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				else {
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			res = VkMulComplex(sc, regID[i], regID[i], w, temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		//important
		//res = VkMovComplex(sc, regID[1], sc->locID[1]);
		//if (res != VKFFT_SUCCESS) return res;
		//res = VkMovComplex(sc, regID[2], sc->locID[2]);
		//if (res != VKFFT_SUCCESS) return res;
		uint64_t P = 3;
		uint64_t Q = 4;
		for (uint64_t i = 0; i < Q; i++) {
			res = VkMovComplex(sc, sc->locID[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[1], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[2], regID[i + 2 * Q], tf[1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, regID[i], sc->locID[0]);
			if (res != VKFFT_SUCCESS) return res;
			if (stageAngle < 0)
			{
				res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
		}


		for (uint64_t i = 0; i < P; i++) {
			for (uint64_t j = 0; j < Q; j++) {
				if (i > 0) {
					if (stageAngle < 0) {
						sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(2 * i * j * double_PI / radix), LFending);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)-sin(2 * i * j * double_PI / radix), LFending);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(2 * i * j * double_PI / radix), LFending);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)sin(2 * i * j * double_PI / radix), LFending);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					res = VkMulComplex(sc, regID[Q * i + j], regID[Q * i + j], w, temp);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			res = VkMovComplex(sc, temp, regID[Q * i + 2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[Q * i + 2], regID[Q * i], regID[Q * i + 2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp);
			if (res != VKFFT_SUCCESS) return res;

			res = VkMovComplex(sc, temp, regID[Q * i + 3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[Q * i + 3], regID[Q * i + 1], regID[Q * i + 3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[Q * i + 1], regID[Q * i + 1], temp);
			if (res != VKFFT_SUCCESS) return res;

			res = VkMovComplex(sc, temp, regID[Q * i + 1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[Q * i + 1], regID[Q * i], regID[Q * i + 1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp);
			if (res != VKFFT_SUCCESS) return res;

			if (stageAngle < 0) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", temp, regID[Q * i + 3]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", temp, regID[Q * i + 3]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", temp, regID[Q * i + 3]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", temp, regID[Q * i + 3]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			res = VkSubComplex(sc, regID[Q * i + 3], regID[Q * i + 2], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[Q * i + 2], regID[Q * i + 2], temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		uint64_t permute2[12] = { 0,4,8,2,6,10,1,5,9,3,7,11 };
		res = VkPermute(sc, permute2, 12, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;

		for (uint64_t i = 0; i < 2; i++) {
			free(tf[i]);
			tf[i] = 0;
		}
		break;
	}
	case 13: {
		// Heap-allocated 50-byte scratch strings that receive the radix-13 constant
		// literals below (only the first 12 slots of each array are used in this case).
		// Both arrays are zero-initialised so that every slot is either NULL or a live
		// allocation; this keeps the failure cleanup well-defined.
		char* tf_x[20] = { 0 };
		char* tf_y[20] = { 0 };
		for (uint64_t i = 0; i < 12; i++) {
			tf_x[i] = (char*)malloc(sizeof(char) * 50);
			// Only attempt the second allocation if the first succeeded; otherwise
			// tf_y[i] stays NULL from the initialiser.
			if (tf_x[i]) tf_y[i] = (char*)malloc(sizeof(char) * 50);
			if (!tf_x[i] || !tf_y[i]) {
				// Release everything allocated so far in BOTH arrays (indices 0..i).
				// The previous code freed tf_x[0..11] here, touching slots that had
				// never been assigned, and leaked tf_y[0..i-1] when tf_x[i] failed.
				// free(NULL) is a no-op, so partially filled slots are safe.
				for (uint64_t j = 0; j <= i; j++) {
					free(tf_x[j]);
					tf_x[j] = 0;
					free(tf_y[j]);
					tf_y[j] = 0;
				}
				return VKFFT_ERROR_MALLOC_FAILED;
			}
		}
		// Fill tf_x[0..11] / tf_y[0..11] with decimal literals, each followed by the
		// LFending suffix. These strings are later handed to VkFMA3Complex_const_w as
		// constant operands, i.e. they end up verbatim in the generated kernel source.
		// The longest literal here is 38 characters; the buffers are 50 bytes, so this
		// relies on LFending being short (LFending is defined outside this view).
		// NOTE(review): each (tf_x[k], tf_y[k]) pair has unit modulus to printed
		// precision and entries 6..11 look like the conjugates of entries 0..5, which
		// is consistent with 13th roots of unity in a permuted order - confirm against
		// the derivation before editing any digit.
		// tf_x does not depend on the transform direction.
		sprintf(tf_x[0], "8.8545602565320989587194927539215e-01%s", LFending);
		sprintf(tf_x[1], "-9.7094181742605202719252621701429e-01%s", LFending);
		sprintf(tf_x[2], "1.2053668025532305345994812592614e-01%s", LFending);
		sprintf(tf_x[3], "-7.4851074817110109868448578063216e-01%s", LFending);
		sprintf(tf_x[4], "-3.5460488704253562600274447824678e-01%s", LFending);
		sprintf(tf_x[5], "5.6806474673115580237845248512407e-01%s", LFending);
		sprintf(tf_x[6], "8.8545602565320989608878970988926e-01%s", LFending);
		sprintf(tf_x[7], "-9.7094181742605202719252621701429e-01%s", LFending);
		sprintf(tf_x[8], "1.2053668025532305324988395500707e-01%s", LFending);
		sprintf(tf_x[9], "-7.4851074817110109863027567200788e-01%s", LFending);
		sprintf(tf_x[10], "-3.5460488704253562600274447824678e-01%s", LFending);
		sprintf(tf_x[11], "5.6806474673115580248687270237262e-01%s", LFending);
		// tf_y carries the direction: the two branches below hold the same magnitudes
		// with every sign flipped, selected by the sign of stageAngle.
		if (stageAngle < 0) {
			sprintf(tf_y[0], "-4.6472317204376854566250792943904e-01%s", LFending);
			sprintf(tf_y[1], "2.3931566428755776706062234626682e-01%s", LFending);
			sprintf(tf_y[2], "9.9270887409805399278096144088934e-01%s", LFending);
			sprintf(tf_y[3], "-6.6312265824079520232193704631918e-01%s", LFending);
			sprintf(tf_y[4], "9.3501624268541482344965776185575e-01%s", LFending);
			sprintf(tf_y[5], "8.2298386589365639468820687318917e-01%s", LFending);
			sprintf(tf_y[6], "4.6472317204376854531014222338126e-01%s", LFending);
			sprintf(tf_y[7], "-2.3931566428755776695220212901827e-01%s", LFending);
			sprintf(tf_y[8], "-9.9270887409805399283517154951362e-01%s", LFending);
			sprintf(tf_y[9], "6.6312265824079520243035726356773e-01%s", LFending);
			sprintf(tf_y[10], "-9.3501624268541482344965776185575e-01%s", LFending);
			sprintf(tf_y[11], "-8.2298386589365639457978665594062e-01%s", LFending);
		}
		else {
			sprintf(tf_y[0], "4.6472317204376854566250792943904e-01%s", LFending);
			sprintf(tf_y[1], "-2.3931566428755776706062234626682e-01%s", LFending);
			sprintf(tf_y[2], "-9.9270887409805399278096144088934e-01%s", LFending);
			sprintf(tf_y[3], "6.6312265824079520232193704631918e-01%s", LFending);
			sprintf(tf_y[4], "-9.3501624268541482344965776185575e-01%s", LFending);
			sprintf(tf_y[5], "-8.2298386589365639468820687318917e-01%s", LFending);
			sprintf(tf_y[6], "-4.6472317204376854531014222338126e-01%s", LFending);
			sprintf(tf_y[7], "2.3931566428755776695220212901827e-01%s", LFending);
			sprintf(tf_y[8], "9.9270887409805399283517154951362e-01%s", LFending);
			sprintf(tf_y[9], "-6.6312265824079520243035726356773e-01%s", LFending);
			sprintf(tf_y[10], "9.3501624268541482344965776185575e-01%s", LFending);
			sprintf(tf_y[11], "8.2298386589365639457978665594062e-01%s", LFending);
		}
		// Twiddle stage: for i = radix-1 down to 1, emit kernel text that loads or
		// computes the twiddle into `w`, then emit locID[i] = regID[i] * w via
		// VkMulComplex. Input 0 is not multiplied here.
		// NOTE(review): every early `return res` in this loop leaves this case without
		// reaching the free() of tf_x / tf_y at the end of the case, so those buffers
		// are leaked on an append failure.
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				// First stage: the twiddle is emitted as the constant (1, 0).
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				// The two branches below differ only in the LUT path: for i == radix-1
				// the table offset (radix - 1 - i) * stageSize is zero and is omitted
				// from the emitted index expression.
				if (i == radix - 1) {
					if (sc->LUT) {
						// Table lookup, either from sdata or from twiddleLUT depending
						// on useCoalescedLUTUploadToSM.
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						// Negative stageAngle: negate the imaginary part of the table value.
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						// No LUT: emit a run-time evaluation at angle * (2*i/radix).
						// If floatType is neither "float" nor "double", nothing is
						// emitted for `w` in this branch.
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				else {
					if (sc->LUT) {
						// Same lookup as above, offset by (radix - 1 - i) * stageSize entries.
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						// Identical to the non-LUT path of the i == radix-1 branch.
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			// Twiddled value goes to locID[i]; regID[i] is left as the source operand.
			// The last argument (scratch name) is passed as 0 here.
			res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Radix-13 butterfly emission. All arithmetic below is emitted as kernel text
		// through the Vk*Complex* helpers, which are defined outside this view; the
		// per-helper semantics noted here are inferred from their names - TODO confirm.
		// Input 0 was not twiddled, so copy it into locID[0] unchanged.
		res = VkMovComplex(sc, sc->locID[0], regID[0]);
		if (res != VKFFT_SUCCESS) return res;
		// Entries 1..12 of this table are the successive powers of 2 modulo 13
		// (1,2,4,8,3,6,12,11,9,5,10,7). NOTE(review): with type 0 and a null regID
		// argument VkPermute presumably reorders sc->locID - confirm in VkPermute.
		uint64_t permute[13] = { 0, 1, 2, 4, 8, 3, 6, 12, 11, 9, 5, 10, 7 };
		res = VkPermute(sc, permute, 13, 0, 0, w);
		if (res != VKFFT_SUCCESS) return res;
		// Pair locID[i+1] with locID[i+7] and write mixed sum/difference combinations
		// of their .x and .y parts into regID[i+1] and regID[i+7].
		for (uint64_t i = 0; i < 6; i++) {
			res = VkSubComplex_x(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex_x(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 7]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex_y(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex_y(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 7]);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Accumulate into regID[0]: .x from regID[1..6], .y from regID[7..12].
		for (uint64_t i = 0; i < 6; i++) {
			res = VkAddComplex_x(sc, regID[0], regID[0], regID[i + 1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex_y(sc, regID[0], regID[0], regID[i + 7]);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Seed the accumulators: locID[1..6] start as a copy of locID[0] ...
		for (uint64_t i = 1; i < 7; i++) {
			sc->tempLen = sprintf(sc->tempStr, "\
	%s=%s;\n", sc->locID[i], sc->locID[0]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		// ... and locID[7..12] start at zero.
		for (uint64_t i = 7; i < 13; i++) {
			sc->tempLen = sprintf(sc->tempStr, "\
	%s.x=0;\n\
	%s.y=0;\n", sc->locID[i], sc->locID[i]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		// 6x6 accumulation: for every (i, j) the constant pair with index
		// (12 - i + j) mod 12 is combined with regID[i+1] / regID[i+7] and folded into
		// locID[j+1] / locID[j+7]. Presumably a cyclic-convolution step - TODO confirm
		// against VkFMA3Complex_const_w.
		for (uint64_t i = 0; i < 6; i++) {
			for (uint64_t j = 0; j < 6; j++) {
				uint64_t id = ((12 - i) + j) % 12;
				res = VkFMA3Complex_const_w(sc, sc->locID[j + 1], sc->locID[j + 7], regID[i + 1], tf_x[id], tf_y[id], regID[i + 7]);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		// Recombine accumulator pairs (locID[i], locID[i+6]) into outputs regID[i] ...
		for (uint64_t i = 1; i < 7; i++) {
			res = VkSubComplex_x(sc, regID[i], sc->locID[i], sc->locID[i + 6]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex_y(sc, regID[i], sc->locID[i], sc->locID[i + 6]);
			if (res != VKFFT_SUCCESS) return res;
		}
		// ... and regID[i+6], using the opposite sign choice on each component.
		for (uint64_t i = 1; i < 7; i++) {
			res = VkAddComplex_x(sc, regID[i + 6], sc->locID[i], sc->locID[i + 6]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex_y(sc, regID[i + 6], sc->locID[i], sc->locID[i + 6]);
			if (res != VKFFT_SUCCESS) return res;
		}

		uint64_t permute2[13] = { 0,1,12,9,11,4,8,2,10,5,3,6,7 };
		res = VkPermute(sc, permute2, 13, 1, regID, w);
		//if (res != VKFFT_SUCCESS) return res;
		for (uint64_t i = 0; i < 12; i++) {
			free(tf_x[i]);
			tf_x[i] = 0;
			free(tf_y[i]);
			tf_y[i] = 0;
		}
		for (uint64_t i = 0; i < 12; i++) {
			free(tf_x[i]);
			tf_x[i] = 0;
			free(tf_y[i]);
			tf_y[i] = 0;
		}
		//old version
		/*char* tf[20];
		//char* tf2[4];
		//char* tf2inv[4];
		//VkAppendLine(sc, "	{\n");
		for (uint64_t i = 0; i < 20; i++) {
			tf[i] = (char*)malloc(sizeof(char) * 50);
			if (!tf[i]) {
				for (uint64_t j = 0; j < i; j++) {
					free(tf[j]);
					tf[j] = 0;
				}
				return VKFFT_ERROR_MALLOC_FAILED;
			}
			//tf2[i] = (char*)malloc(sizeof(char) * 50);
			//tf2inv[i] = (char*)malloc(sizeof(char) * 50);
		}
		sprintf(tf[0], "-1.08333333333333333e+00%s", LFending);
		sprintf(tf[1], "-3.00462606288665890e-01%s", LFending);
		sprintf(tf[5], "1.00707406572753300e+00%s", LFending);
		sprintf(tf[6], "7.31245990975348148e-01%s", LFending);
		sprintf(tf[7], "-5.79440018900960419e-01%s", LFending);
		sprintf(tf[8], "5.31932498429674383e-01%s", LFending);
		sprintf(tf[9], "-5.08814921720397551e-01%s", LFending);
		sprintf(tf[10], "-7.70585890309231480e-03%s", LFending);

		if (stageAngle < 0) {
			sprintf(tf[2], "-7.49279330626139051e-01%s", LFending);
			sprintf(tf[3], "4.01002128321867324e-01%s", LFending);
			sprintf(tf[4], "1.74138601152135891e-01%s", LFending);
			sprintf(tf[11], "-2.51139331838956803e+00%s", LFending);
			sprintf(tf[12], "-1.82354640868242068e+00%s", LFending);
			sprintf(tf[13], "1.44497990902399609e+00%s", LFending);
			sprintf(tf[14], "-1.34405691517736958e+00%s", LFending);
			sprintf(tf[15], "-9.75932420775945109e-01%s", LFending);
			sprintf(tf[16], "7.73329778651104860e-01%s", LFending);
			sprintf(tf[17], "1.92772511678346858e+00%s", LFending);
			sprintf(tf[18], "1.39973941472918284e+00%s", LFending);
			sprintf(tf[19], "-1.10915484383755047e+00%s", LFending);
		}
		else {
			sprintf(tf[2], "7.49279330626139051e-01%s", LFending);
			sprintf(tf[3], "-4.01002128321867324e-01%s", LFending);
			sprintf(tf[4], "-1.74138601152135891e-01%s", LFending);
			sprintf(tf[11], "2.51139331838956803e+00%s", LFending);
			sprintf(tf[12], "1.82354640868242068e+00%s", LFending);
			sprintf(tf[13], "-1.44497990902399609e+00%s", LFending);
			sprintf(tf[14], "1.34405691517736958e+00%s", LFending);
			sprintf(tf[15], "9.75932420775945109e-01%s", LFending);
			sprintf(tf[16], "-7.73329778651104860e-01%s", LFending);
			sprintf(tf[17], "-1.92772511678346858e+00%s", LFending);
			sprintf(tf[18], "-1.39973941472918284e+00%s", LFending);
			sprintf(tf[19], "1.10915484383755047e+00%s", LFending);
		}
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				if (i == radix - 1) {
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				else {
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
			if (res != VKFFT_SUCCESS) return res;

		}
		res = VkMovComplex(sc, sc->locID[0], regID[0]);
		if (res != VKFFT_SUCCESS) return res;
		uint64_t permute[13] = { 0,1,3,9,5,2,6,12,10,4,8,11,7 };
		res = VkPermute(sc, permute, 13, 0, 0, w);
		if (res != VKFFT_SUCCESS) return res;
		for (uint64_t i = 0; i < 6; i++) {
			res = VkSubComplex(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[i + 1], sc->locID[i + 1], sc->locID[i + 7]);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 0; i < 3; i++) {
			res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4], sc->locID[i + 1], sc->locID[i + 4]);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 0; i < 4; i++) {
			res = VkAddComplex(sc, sc->locID[i + 1], regID[i * 3 + 1], regID[i * 3 + 2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, sc->locID[i * 2 + 5], regID[i * 3 + 1], regID[i * 3 + 3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[i + 1], sc->locID[i + 1], regID[i * 3 + 3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, sc->locID[i * 2 + 6], regID[i * 3 + 2], regID[i * 3 + 3]);
			if (res != VKFFT_SUCCESS) return res;
		}

		res = VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, regID[1], sc->locID[1], tf[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumber(sc, regID[2], sc->locID[2], tf[1]);
		if (res != VKFFT_SUCCESS) return res;
		for (uint64_t k = 0; k < 3; k++) {
			res = VkAddComplex(sc, regID[k * 2 + 4], sc->locID[k * 2 + 3], sc->locID[k * 2 + 4]);

			if (k == 0) {
				res = VkMulComplexNumberImag(sc, sc->locID[k * 2 + 3], sc->locID[k * 2 + 3], tf[k * 3 + 2], sc->locID[0]);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMulComplexNumberImag(sc, sc->locID[k * 2 + 4], sc->locID[k * 2 + 4], tf[k * 3 + 3], sc->locID[0]);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMulComplexNumberImag(sc, regID[k * 2 + 4], regID[k * 2 + 4], tf[k * 3 + 4], sc->locID[0]);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkMulComplexNumber(sc, sc->locID[k * 2 + 3], sc->locID[k * 2 + 3], tf[k * 3 + 2]);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMulComplexNumber(sc, sc->locID[k * 2 + 4], sc->locID[k * 2 + 4], tf[k * 3 + 3]);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMulComplexNumber(sc, regID[k * 2 + 4], regID[k * 2 + 4], tf[k * 3 + 4]);
				if (res != VKFFT_SUCCESS) return res;
			}

			res = VkAddComplex(sc, regID[k * 2 + 3], sc->locID[k * 2 + 3], regID[k * 2 + 4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[k * 2 + 4], sc->locID[k * 2 + 4], regID[k * 2 + 4]);
			if (res != VKFFT_SUCCESS) return res;

		}
		res = VkAddComplex(sc, regID[9], sc->locID[9], sc->locID[11]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[10], sc->locID[10], sc->locID[12]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[11], sc->locID[9], sc->locID[10]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[12], sc->locID[11], sc->locID[12]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[1], regID[9], regID[10]);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMulComplexNumberImag(sc, sc->locID[9], sc->locID[9], tf[11], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumberImag(sc, sc->locID[10], sc->locID[10], tf[12], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumberImag(sc, regID[11], regID[11], tf[13], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumberImag(sc, sc->locID[11], sc->locID[11], tf[14], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumberImag(sc, sc->locID[12], sc->locID[12], tf[15], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumberImag(sc, regID[12], regID[12], tf[16], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumberImag(sc, regID[9], regID[9], tf[17], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumberImag(sc, regID[10], regID[10], tf[18], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMulComplexNumberImag(sc, sc->locID[1], sc->locID[1], tf[19], sc->locID[0]);
		if (res != VKFFT_SUCCESS) return res;

		res = VkAddComplex(sc, sc->locID[9], sc->locID[9], regID[9]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[11], sc->locID[11], regID[9]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[10], sc->locID[10], regID[10]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, sc->locID[12], sc->locID[12], regID[10]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[11], regID[11], sc->locID[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[12], regID[12], sc->locID[1]);
		if (res != VKFFT_SUCCESS) return res;

		res = VkAddComplex(sc, regID[9], sc->locID[9], regID[11]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[10], sc->locID[10], regID[11]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[11], sc->locID[11], regID[12]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddComplex(sc, regID[12], sc->locID[12], regID[12]);
		if (res != VKFFT_SUCCESS) return res;

		res = VkAddComplex(sc, regID[1], regID[0], regID[1]);
		if (res != VKFFT_SUCCESS) return res;

		for (uint64_t i = 0; i < 4; i++) {
			res = VkAddComplex(sc, sc->locID[i * 3 + 1], regID[i + 1], regID[i * 2 + 5]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, sc->locID[i * 3 + 3], regID[i + 1], regID[i * 2 + 5]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[i * 3 + 2], regID[i + 1], regID[i * 2 + 6]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, sc->locID[i * 3 + 3], sc->locID[i * 3 + 3], regID[i * 2 + 6]);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 0; i < 3; i++) {
			res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, sc->locID[i + 4], sc->locID[i + 1], sc->locID[i + 4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[i + 1], regID[i + 1]);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 0; i < 6; i++) {
			res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 7]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]);
			if (res != VKFFT_SUCCESS) return res;
		}
		uint64_t permute2[13] = { 0,12,1,10,5,3,2,8,9,11,4,7,6 };
		res = VkPermute(sc, permute2, 13, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;

		for (uint64_t i = 0; i < 20; i++) {
			free(tf[i]);
			tf[i] = 0;
		}*/
		break;
	}
	case 14: {
		char* tf[8];

		//VkAppendLine(sc, "	{\n");
		for (uint64_t i = 0; i < 8; i++) {
			tf[i] = (char*)malloc(sizeof(char) * 50);
			if (!tf[i]) {
				for (uint64_t j = 0; j < i; j++) {
					free(tf[j]);
					tf[j] = 0;
				}
				return VKFFT_ERROR_MALLOC_FAILED;
			}
		}
		sprintf(tf[0], "-1.16666666666666651863693004997913%s", LFending);
		sprintf(tf[1], "0.79015646852540022404554065360571%s", LFending);
		sprintf(tf[2], "0.05585426728964774240049351305970%s", LFending);
		sprintf(tf[3], "0.73430220123575240531721419756650%s", LFending);
		if (stageAngle < 0) {
			sprintf(tf[4], "0.44095855184409837868031445395900%s", LFending);
			sprintf(tf[5], "0.34087293062393136944265847887436%s", LFending);
			sprintf(tf[6], "-0.53396936033772524066165487965918%s", LFending);
			sprintf(tf[7], "0.87484229096165666561546458979137%s", LFending);
		}
		else {
			sprintf(tf[4], "-0.44095855184409837868031445395900%s", LFending);
			sprintf(tf[5], "-0.34087293062393136944265847887436%s", LFending);
			sprintf(tf[6], "0.53396936033772524066165487965918%s", LFending);
			sprintf(tf[7], "-0.87484229096165666561546458979137%s", LFending);
		}
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				if (i == radix - 1) {
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				else {
					if (sc->LUT) {
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			res = VkMulComplex(sc, regID[i], regID[i], w, temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		//important
		//res = VkMovComplex(sc, regID[1], sc->locID[1]);
		//if (res != VKFFT_SUCCESS) return res;

		uint64_t P = 7;
		uint64_t Q = 2;
		for (uint64_t i = 0; i < Q; i++) {
			res = VkMovComplex(sc, sc->locID[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[1], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[3], regID[i + 3 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[4], regID[i + 4 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[5], regID[i + 5 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[6], regID[i + 6 * Q]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkAddComplex(sc, regID[i], sc->locID[1], sc->locID[6]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + Q], sc->locID[1], sc->locID[6]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[5]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[5]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i + 4 * Q], sc->locID[4], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 5 * Q], sc->locID[4], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkAddComplex(sc, sc->locID[5], regID[i + Q], regID[i + 3 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[i + 5 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[1], regID[i], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[i + 4 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[0], sc->locID[0], sc->locID[1]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkSubComplex(sc, sc->locID[2], regID[i], regID[i + 4 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, sc->locID[3], regID[i + 4 * Q], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, sc->locID[4], regID[i + 2 * Q], regID[i]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkSubComplex(sc, regID[i], regID[i + Q], regID[i + 5 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2 * Q], regID[i + 5 * Q], regID[i + 3 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4 * Q], regID[i + 3 * Q], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[0]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[2], sc->locID[2], tf[1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[5], sc->locID[5], tf[4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, regID[i], regID[i], tf[5]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, regID[i + 2 * Q], regID[i + 2 * Q], tf[6]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, regID[i + 4 * Q], regID[i + 4 * Q], tf[7]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkSubComplex(sc, regID[i + 5 * Q], regID[i + 4 * Q], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplexInv(sc, regID[i + 6 * Q], regID[i + 4 * Q], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i + 4 * Q], regID[i], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], sc->locID[0], sc->locID[1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i + Q], sc->locID[2], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[4], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplexInv(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[1], regID[i], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[2], regID[i], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[3], regID[i], regID[i + 3 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[4], regID[i + 4 * Q], sc->locID[5]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[6], regID[i + 6 * Q], sc->locID[5]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[i + 5 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, regID[i], sc->locID[0]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[3], sc->locID[6], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[5], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplexInv(sc, regID[i + 4 * Q], sc->locID[2], sc->locID[5], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplex(sc, regID[i + 5 * Q], sc->locID[3], sc->locID[6], 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkShuffleComplex(sc, regID[i + 6 * Q], sc->locID[1], sc->locID[4], 0);
			if (res != VKFFT_SUCCESS) return res;

		}


		for (uint64_t i = 0; i < P; i++) {
			if (i > 0) {
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)-sin(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)sin(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				res = VkMulComplex(sc, temp, regID[Q * i + 1], w, 0);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkMovComplex(sc, temp, regID[Q * i + 1]);
				if (res != VKFFT_SUCCESS) return res;
			}
			res = VkSubComplex(sc, regID[Q * i + 1], regID[Q * i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		uint64_t permute2[14] = { 0,2,4,6,8,10,12,1,3,5,7,9,11,13 };
		res = VkPermute(sc, permute2, 14, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;

		for (uint64_t i = 0; i < 8; i++) {
			free(tf[i]);
			tf[i] = 0;
		}
		break;
	}
	case 15: {
		// Radix-15 butterfly emitted as two passes over the 15 registers in regID:
		//   1) Q = 3 five-point butterflies on registers i, i+Q, i+2Q, i+3Q, i+4Q;
		//   2) P = 5 three-point butterflies on registers Q*i, Q*i+1, Q*i+2,
		//      each preceded by multiplication with constant internal twiddles;
		// followed by a register permutation (permute2).
		//
		// tf / tf2 hold the numeric constants (as text, suffixed with LFending)
		// that are pasted into the generated code. They are small and of fixed
		// size, so they live on the stack: this way none of the early
		// "return res" exits below can leak them, and no allocation can fail.
		char tf[5][50];
		sprintf(tf[0], "-0.5%s", LFending);
		sprintf(tf[1], "1.538841768587626701285145288018455%s", LFending);
		sprintf(tf[2], "-0.363271264002680442947733378740309%s", LFending);
		sprintf(tf[3], "-0.809016994374947424102293417182819%s", LFending);
		sprintf(tf[4], "-0.587785252292473129168705954639073%s", LFending);

		// Constants of the three-point butterfly: -1/2 and approximately -sqrt(3)/2.
		char tf2[2][50];
		sprintf(tf2[0], "-0.5%s", LFending);
		sprintf(tf2[1], "-0.8660254037844386467637231707529%s", LFending);

		// Stage twiddles: multiply regID[radix-1] ... regID[1] by w, where w is
		// either the identity (stageSize == 1), a value read from the LUT
		// (conjugated when stageAngle < 0), or computed from "angle" in the
		// generated code.
		for (uint64_t i = radix - 1; i > 0; i--) {
			if (stageSize == 1) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				if (i == radix - 1) {
					if (sc->LUT) {
						// First LUT entry of the stage is read without an offset.
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				else {
					if (sc->LUT) {
						// Subsequent LUT entries are stageSize elements apart.
						if (sc->useCoalescedLUTUploadToSM) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (stageAngle < 0) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			res = VkMulComplex(sc, regID[i], regID[i], w, temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		uint64_t P = 5;
		uint64_t Q = 3;

		// Pass 1: Q five-point butterflies over registers strided by Q.
		for (uint64_t i = 0; i < Q; i++) {
			// Snapshot the five inputs into the scratch registers locID[0..4].
			res = VkMovComplex(sc, sc->locID[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[1], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[3], regID[i + 3 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, sc->locID[4], regID[i + 4 * Q]);
			if (res != VKFFT_SUCCESS) return res;

			// Sums and differences of the mirrored input pairs (1,4) and (2,3).
			res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, sc->locID[3], regID[i + Q], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[4], regID[i + 3 * Q], regID[i + 4 * Q]);
			if (res != VKFFT_SUCCESS) return res;

			// Output 0 is accumulated in locID[0]; the rest are scaled by the tf constants.
			res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[0], sc->locID[0], regID[i + 2 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkFMAComplex(sc, sc->locID[2], regID[i + 2 * Q], tf[0], regID[i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, regID[i + 3 * Q], regID[i + 3 * Q], tf[1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, regID[i + 4 * Q], regID[i + 4 * Q], tf[2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[4]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkSubComplex(sc, sc->locID[1], sc->locID[1], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[2], sc->locID[2], sc->locID[3]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[3], regID[i + 3 * Q], sc->locID[4]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, sc->locID[4], sc->locID[4], regID[i + 4 * Q]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, regID[i], sc->locID[0]);
			if (res != VKFFT_SUCCESS) return res;

			// The sign of stageAngle selects which shuffle variant produces
			// each of the four remaining outputs.
			if (stageAngle < 0)
			{
				res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0);
				if (res != VKFFT_SUCCESS) return res;
			}

		}


		// Pass 2: P three-point butterflies over consecutive register triples.
		for (uint64_t i = 0; i < P; i++) {
			if (i > 0) {
				// Internal twiddle for the second element of the triple:
				// (cos, +/-sin) of 2*i*pi/radix, emitted as numeric constants;
				// the sine is negated when stageAngle < 0.
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)-sin(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)sin(2 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				res = VkMulComplex(sc, sc->locID[1], regID[Q * i + 1], w, temp);
				if (res != VKFFT_SUCCESS) return res;
				// Internal twiddle for the third element: same form with angle 4*i*pi/radix.
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(4 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)-sin(4 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %.17e%s;\n", w, (double)cos(4 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %.17e%s;\n\n", w, (double)sin(4 * i * double_PI / radix), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				res = VkMulComplex(sc, sc->locID[2], regID[Q * i + 2], w, temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				// First triple needs no internal twiddle: plain copies.
				res = VkMovComplex(sc, sc->locID[1], regID[1]);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovComplex(sc, sc->locID[2], regID[2]);
				if (res != VKFFT_SUCCESS) return res;
			}

			res = VkAddComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2]);
			if (res != VKFFT_SUCCESS) return res;

			res = VkAddComplex(sc, sc->locID[0], regID[Q * i], regID[Q * i + 1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkFMAComplex(sc, sc->locID[1], regID[Q * i + 1], tf2[0], regID[Q * i]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMulComplexNumber(sc, sc->locID[2], regID[Q * i + 2], tf2[1]);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, regID[Q * i], sc->locID[0]);
			if (res != VKFFT_SUCCESS) return res;
			if (stageAngle < 0)
			{
				res = VkShuffleComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplexInv(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				res = VkShuffleComplexInv(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkShuffleComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		// Final reordering of the 15 registers.
		uint64_t permute2[15] = { 0, 3, 6, 9, 12, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14 };
		res = VkPermute(sc, permute2, 15, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;

		break;
	}
	case 16: {
		if (res != VKFFT_SUCCESS) return res;
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle);\n", w, cosDef);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle);\n", w, sinDef);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (!strcmp(floatType, "double")) {
						sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle);\n", w);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle);\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		for (uint64_t i = 0; i < 8; i++) {
			res = VkMulComplex(sc, temp, regID[i + 8], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 8], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(0.5%s*angle);\n", w, cosDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(0.5%s*angle);\n", w, sinDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		for (uint64_t i = 0; i < 4; i++) {
			res = VkMulComplex(sc, temp, regID[i + 4], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(w.y, -w.x);\n\n", vecType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	iw = %s(-w.y, w.x);\n\n", vecType);
		}

		for (uint64_t i = 8; i < 12; i++) {
			res = VkMulComplex(sc, temp, regID[i + 4], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 2 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 2 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(0.25%s*angle);\n", w, cosDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(0.25%s*angle);\n", w, sinDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType);
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		for (uint64_t i = 0; i < 2; i++) {
			res = VkMulComplex(sc, temp, regID[i + 2], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(w.y, -w.x);\n\n", vecType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	iw = %s(-w.y, w.x);\n\n", vecType);
		}
		for (uint64_t i = 4; i < 6; i++) {
			res = VkMulComplex(sc, temp, regID[i + 2], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 8; i < 10; i++) {
			res = VkMulComplex(sc, temp, regID[i + 2], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(iw.y, -iw.x);\n\n", vecType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(-iw.y, iw.x);\n\n", vecType);
		}
		for (uint64_t i = 12; i < 14; i++) {
			res = VkMulComplex(sc, temp, regID[i + 2], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 3 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 3 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(0.125%s*angle);\n", w, cosDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(0.125%s*angle);\n", w, sinDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType);
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}

		for (uint64_t i = 0; i < 1; i++) {
			res = VkMulComplex(sc, temp, regID[i + 1], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(w.y, -w.x);\n\n", vecType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	iw = %s(-w.y, w.x);\n\n", vecType);
		}
		for (uint64_t i = 2; i < 3; i++) {
			res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}


		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 4; i < 5; i++) {
			res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, iw, temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, iw, temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 6; i < 7; i++) {
			res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}


		for (uint64_t j = 0; j < 2; j++) {
			if (stageAngle < 0) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t i = 8 + 4 * j; i < 9 + 4 * j; i++) {
				res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
				res = VkAddComplex(sc, regID[i], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (stageAngle < 0) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovComplex(sc, iw, temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovComplex(sc, iw, temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t i = 10 + 4 * j; i < 11 + 4 * j; i++) {
				res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
				res = VkAddComplex(sc, regID[i], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		uint64_t permute2[16] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 };
		res = VkPermute(sc, permute2, 16, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;

		/*res = VkMovComplex(sc, temp, regID[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[1], regID[8]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[8], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[2], regID[4]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[4], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[3], regID[12]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[12], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[5]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[5], regID[10]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[10], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[7]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[7], regID[14]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[14], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[11]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[11], regID[13]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[13], temp);
		if (res != VKFFT_SUCCESS) return res;*/
		break;
	}
	case 32: {
		if (res != VKFFT_SUCCESS) return res;
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId];\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(angle);\n", w, cosDef);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(angle);\n", w, sinDef);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (!strcmp(floatType, "double")) {
						sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle);\n", w);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sincos_20(angle);\n", w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		for (uint64_t i = 0; i < 16; i++) {
			res = VkMulComplex(sc, temp, regID[i + 16], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 16], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(0.5%s*angle);\n", w, cosDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(0.5%s*angle);\n", w, sinDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		for (uint64_t i = 0; i < 8; i++) {
			res = VkMulComplex(sc, temp, regID[i + 8], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 8], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(w.y, -w.x);\n\n", vecType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	iw = %s(-w.y, w.x);\n\n", vecType);
		}

		for (uint64_t i = 16; i < 24; i++) {
			res = VkMulComplex(sc, temp, regID[i + 8], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 8], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 2 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 2 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(0.25%s*angle);\n", w, cosDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(0.25%s*angle);\n", w, sinDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType);
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		for (uint64_t i = 0; i < 4; i++) {
			res = VkMulComplex(sc, temp, regID[i + 4], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(w.y, -w.x);\n\n", vecType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	iw = %s(-w.y, w.x);\n\n", vecType);
		}
		for (uint64_t i = 8; i < 12; i++) {
			res = VkMulComplex(sc, temp, regID[i + 4], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 16; i < 20; i++) {
			res = VkMulComplex(sc, temp, regID[i + 4], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(iw.y, -iw.x);\n\n", vecType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", w, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(-iw.y, iw.x);\n\n", vecType);
		}
		for (uint64_t i = 24; i < 28; i++) {
			res = VkMulComplex(sc, temp, regID[i + 4], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 4], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}

		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 3 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 3 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(0.125%s*angle);\n", w, cosDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(0.125%s*angle);\n", w, sinDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType);
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}

		for (uint64_t i = 0; i < 2; i++) {
			res = VkMulComplex(sc, temp, regID[i + 2], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(w.y, -w.x);\n\n", vecType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	iw = %s(-w.y, w.x);\n\n", vecType);
		}
		for (uint64_t i = 4; i < 6; i++) {
			res = VkMulComplex(sc, temp, regID[i + 2], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}


		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 8; i < 10; i++) {
			res = VkMulComplex(sc, temp, regID[i + 2], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, iw, temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, iw, temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 12; i < 14; i++) {
			res = VkMulComplex(sc, temp, regID[i + 2], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}


		for (uint64_t j = 0; j < 2; j++) {
			if (stageAngle < 0) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t i = 16 + 8 * j; i < 18 + 8 * j; i++) {
				res = VkMulComplex(sc, temp, regID[i + 2], iw, 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
				res = VkAddComplex(sc, regID[i], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (stageAngle < 0) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovComplex(sc, iw, temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovComplex(sc, iw, temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t i = 20 + 8 * j; i < 22 + 8 * j; i++) {
				res = VkMulComplex(sc, temp, regID[i + 2], iw, 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubComplex(sc, regID[i + 2], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
				res = VkAddComplex(sc, regID[i], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		if (stageSize == 1) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = 1;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = 0;\n", w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->LUT) {
				if (sc->useCoalescedLUTUploadToSM) {
					sc->tempLen = sprintf(sc->tempStr, "	%s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 4 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 4 * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.y;\n", w, w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s(0.0625%s*angle);\n", w, cosDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s(0.0625%s*angle);\n", w, sinDef, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType);
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "	%s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}

		for (uint64_t i = 0; i < 1; i++) {
			res = VkMulComplex(sc, temp, regID[i + 1], w, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	w = %s(w.y, -w.x);\n\n", vecType);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", iw, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	iw = %s(-w.y, w.x);\n\n", vecType);
		}
		for (uint64_t i = 2; i < 3; i++) {
			res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}


		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 4; i < 5; i++) {
			res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (stageAngle < 0) {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, iw, temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", temp, iw);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkMovComplex(sc, iw, temp);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t i = 6; i < 7; i++) {
			res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
			if (res != VKFFT_SUCCESS) return res;
			res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAddComplex(sc, regID[i], regID[i], temp);
			if (res != VKFFT_SUCCESS) return res;
		}


		for (uint64_t j = 0; j < 2; j++) {
			if (stageAngle < 0) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t i = 8 + 4 * j; i < 9 + 4 * j; i++) {
				res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
				res = VkAddComplex(sc, regID[i], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (stageAngle < 0) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovComplex(sc, iw, temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovComplex(sc, iw, temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t i = 10 + 4 * j; i < 11 + 4 * j; i++) {
				res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
				res = VkAddComplex(sc, regID[i], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		for (uint64_t j = 0; j < 4; j++) {
			if ((j == 1) || (j == 2)) {
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, (double)cos((7 - 2 * j) * double_PI / 16), LFending, w, (double)sin((7 - 2 * j) * double_PI / 16), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, (double)cos((7 - 2 * j) * double_PI / 16), LFending, w, (double)sin((7 - 2 * j) * double_PI / 16), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, (double)cos((7 - 2 * j) * double_PI / 16), LFending, w, (double)sin((7 - 2 * j) * double_PI / 16), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, (double)cos((7 - 2 * j) * double_PI / 16), LFending, w, (double)sin((7 - 2 * j) * double_PI / 16), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (stageAngle < 0) {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 16), LFending, w, (double)sin((2 * j + 1) * double_PI / 16), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 16), LFending, w, (double)sin((2 * j + 1) * double_PI / 16), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 16), LFending, w, (double)sin((2 * j + 1) * double_PI / 16), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 16), LFending, w, (double)sin((2 * j + 1) * double_PI / 16), LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			for (uint64_t i = 16 + 4 * j; i < 17 + 4 * j; i++) {
				res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
				res = VkAddComplex(sc, regID[i], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (stageAngle < 0) {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = %s.y;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = -%s.x;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovComplex(sc, iw, temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "	%s.x = -%s.y;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	%s.y = %s.x;\n", temp, iw);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovComplex(sc, iw, temp);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t i = 18 + 4 * j; i < 19 + 4 * j; i++) {
				res = VkMulComplex(sc, temp, regID[i + 1], iw, 0);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubComplex(sc, regID[i + 1], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
				res = VkAddComplex(sc, regID[i], regID[i], temp);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		uint64_t permute2[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 };
		res = VkPermute(sc, permute2, 32, 1, regID, temp);
		if (res != VKFFT_SUCCESS) return res;

		/*res = VkMovComplex(sc, temp, regID[1]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[1], regID[16]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[16], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[2]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[2], regID[8]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[8], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[3]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[3], regID[24]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[24], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[5]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[5], regID[20]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[20], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[6]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[6], regID[12]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[12], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[7]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[7], regID[28]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[28], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[9]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[9], regID[18]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[18], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[11]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[11], regID[26]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[26], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[13]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[13], regID[22]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[22], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[15]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[15], regID[30]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[30], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[19]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[19], regID[25]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[25], temp);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovComplex(sc, temp, regID[23]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[23], regID[29]);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovComplex(sc, regID[29], temp);
		if (res != VKFFT_SUCCESS) return res;*/

		break;
	}
	}
	return res;
}
// Appends the shared-memory declarations of the generated kernel:
//   - "<uintType> sharedStride = N;"
//   - the sdata buffer: a sized array for backends 0, 3 and 4, or a typed pointer
//     aliasing `shared` for backends 1 and 2.
// Along the way it fills the stride bookkeeping of sc (sharedStrideBankConflictFirstStages,
// sharedStrideReadWriteConflict, sharedStrideRaderFFT, sharedShiftRaderFFT, maxSharedStride,
// usedSharedMemory) and, when sc->useRaderMult is set, shifts sc->RaderKernelOffsetShared.
//
// sc         - kernel layout; read for sizes, updated with the computed strides.
// floatType  - "float" or "double"; selects the complex vector type name and its size in bytes (8 / 16).
//              Any other value leaves the vector type name empty and the element size at 1.
// uintType   - type name used for sharedStride in the emitted code.
// sharedType - selects the layout family (see the case labels); values not listed emit nothing.
//
// Returns VKFFT_SUCCESS, or the first error reported by VkAppendLine. sc->sharedMemSize and
// sc->sharedMemSizePow2 are left at their entry values on every exit path.
static inline VkFFTResult appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t sharedType) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Zero-initialized so an unrecognized floatType produces empty names rather than indeterminate bytes.
	char vecType[30] = "";
	char sharedDefinitions[20] = "";
	uint64_t vecSize = 1;
	uint64_t maxSequenceSharedMemory = 0;
	const int isFloat = !strcmp(floatType, "float");
	const int isDouble = !strcmp(floatType, "double");
	if (isFloat || isDouble) {
#if(VKFFT_BACKEND==0)
		sprintf(vecType, "%s", isDouble ? "dvec2" : "vec2");
		sprintf(sharedDefinitions, "shared");
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
		sprintf(vecType, "%s", isDouble ? "double2" : "float2");
		sprintf(sharedDefinitions, "__shared__");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		sprintf(vecType, "%s", isDouble ? "double2" : "float2");
		sprintf(sharedDefinitions, "__local");
#endif
		vecSize = isDouble ? 16 : 8;
	}
	// The additional Rader area is appended after the FFT area further below, so it is
	// temporarily excluded from the budget the stride heuristics see. Undone before returning.
	if (sc->useRaderMult) {
		sc->sharedMemSize -= sc->additionalRaderSharedSize * vecSize;
		sc->sharedMemSizePow2 -= sc->additionalRaderSharedSize * vecSize;
	}
	maxSequenceSharedMemory = sc->sharedMemSize / vecSize;
	uint64_t mergeR2C = (sc->mergeSequencesR2C && (sc->axis_id == 0)) ? 2 : 0;
	// NOTE: error paths inside the switch use `break` instead of `return` so that the
	// Rader adjustment above is always reverted after the switch.
	switch (sharedType) {
	case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c + single_r2c
	{
		const uint64_t halfBanks = sc->numSharedBanks / 2;
		const uint64_t seqLen = sc->fftDim / sc->registerBoost; // unpadded stride
		sc->resolveBankConflictFirstStages = 0;
		// Power-of-two sequences longer than halfBanks get one extra element per halfBanks elements.
		sc->sharedStrideBankConflictFirstStages = ((sc->fftDim > halfBanks) && ((sc->fftDim & (sc->fftDim - 1)) == 0)) ? seqLen * (halfBanks + 1) / halfBanks : seqLen;
		sc->sharedStrideReadWriteConflict = (halfBanks <= sc->localSize[1]) ? seqLen + 1 : seqLen + halfBanks / sc->localSize[1];
		if (sc->sharedStrideReadWriteConflict < seqLen + mergeR2C) sc->sharedStrideReadWriteConflict = seqLen + mergeR2C;
		if (sc->useRaderFFT) {
			// Largest stride needed by any Rader sub-FFT stage (only containers with containerFFTNum < 8 are considered).
			uint64_t max_stride = sc->fftDim;
			uint64_t max_shift = 0;
			for (uint64_t i = 0; i < sc->numRaderPrimes; i++) {
				for (uint64_t j = 0; j < sc->raderContainer[i].numStages; j++) {
					if (sc->raderContainer[i].containerFFTNum < 8) {
						uint64_t subLogicalGroupSize = (uint64_t)ceil(sc->raderContainer[i].containerFFTDim / (double)sc->raderContainer[i].registers_per_thread_per_radix[sc->raderContainer[i].stageRadix[j]]); // hopefully it is not <1, will fix 
						uint64_t shift = (subLogicalGroupSize > (sc->raderContainer[i].containerFFTDim % halfBanks)) ? subLogicalGroupSize - sc->raderContainer[i].containerFFTDim % halfBanks : 0;
						if (j == 0) shift = (sc->raderContainer[i].containerFFTDim % halfBanks) ? 0 : 1;
						uint64_t loc_stride = sc->raderContainer[i].containerFFTDim + shift;
						if (sc->raderContainer[i].containerFFTNum * (loc_stride + 1) > max_stride) {
							max_stride = sc->raderContainer[i].containerFFTNum * (loc_stride + 1);
							if (shift > max_shift) max_shift = shift;
						}
					}
				}
			}
			sc->sharedShiftRaderFFT = max_shift;
			sc->sharedStrideRaderFFT = max_stride;
		}

		sc->maxSharedStride = (sc->sharedStrideBankConflictFirstStages < sc->sharedStrideReadWriteConflict) ? sc->sharedStrideReadWriteConflict : sc->sharedStrideBankConflictFirstStages;
		if ((sc->useRaderFFT) && (sc->maxSharedStride < sc->sharedStrideRaderFFT))
			sc->maxSharedStride = sc->sharedStrideRaderFFT;

		// If the padded layout exceeds the available budget, drop all padding.
		sc->usedSharedMemory = vecSize * sc->localSize[1] * sc->maxSharedStride;
		if (sc->sharedMemSize < sc->usedSharedMemory) sc->maxSharedStride = seqLen;
		if (sc->maxSharedStride == seqLen) {
			sc->sharedStrideBankConflictFirstStages = seqLen;
			sc->sharedStrideReadWriteConflict = seqLen;
			if (sc->useRaderFFT) {
				sc->sharedStrideRaderFFT = seqLen;
				sc->sharedShiftRaderFFT = 0;
			}
		}

		sc->tempLen = sprintf(sc->tempStr, "%s sharedStride = %" PRIu64 ";\n", uintType, sc->sharedStrideReadWriteConflict);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) break;
		sc->usedSharedMemory = vecSize * sc->localSize[1] * sc->maxSharedStride;
		if (sc->useRaderMult) {
			// Move the Rader kernel offsets past the FFT area and account for the extra space.
			// 20 is presumably the length of RaderKernelOffsetShared - confirm against the struct definition.
			for (uint64_t i = 0; i < 20; i++) {
				sc->RaderKernelOffsetShared[i] += sc->usedSharedMemory / vecSize;
			}
			sc->usedSharedMemory += sc->additionalRaderSharedSize * vecSize;
		}
#if((VKFFT_BACKEND==0)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size,  gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->usedSharedMemory / vecSize);
		res = VkAppendLine(sc);
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
		// No sized array is emitted for these backends; sdata is a typed view of `shared`.
		sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType);
		res = VkAppendLine(sc);
#endif
		break;
	}
	case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c + single_c2c_strided
	{
		uint64_t shift = (sc->fftDim < (sc->numSharedBanks / 2)) ? (sc->numSharedBanks / 2) / sc->fftDim : 1;
		sc->sharedStrideReadWriteConflict = ((sc->axisSwapped) && ((sc->localSize[0] % 4) == 0)) ? sc->localSize[0] + shift : sc->localSize[0];
		// Fall back to the unpadded stride when the padded layout does not fit.
		sc->maxSharedStride = (maxSequenceSharedMemory < sc->sharedStrideReadWriteConflict * sc->fftDim / sc->registerBoost) ? sc->localSize[0] : sc->sharedStrideReadWriteConflict;
		if (sc->maxSharedStride == sc->localSize[0]) sc->sharedStrideReadWriteConflict = sc->localSize[0];
		sc->tempLen = sprintf(sc->tempStr, "%s sharedStride = %" PRIu64 ";\n", uintType, sc->maxSharedStride);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) break;
		sc->usedSharedMemory = vecSize * sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost;
		if (sc->useRaderMult) {
			for (uint64_t i = 0; i < 20; i++) {
				sc->RaderKernelOffsetShared[i] += sc->usedSharedMemory / vecSize;
			}
			sc->usedSharedMemory += sc->additionalRaderSharedSize * vecSize;
		}
#if((VKFFT_BACKEND==0)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->usedSharedMemory / vecSize);
		res = VkAppendLine(sc);
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
		sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType);
		res = VkAppendLine(sc);
#endif
		break;
	}
	default:
		// Unlisted sharedType values emit no shared-memory declarations.
		break;
	}
	// Revert the temporary Rader adjustment (reached on success and on failure alike).
	if (sc->useRaderMult) {
		sc->sharedMemSize += sc->additionalRaderSharedSize * vecSize;
		sc->sharedMemSizePow2 += sc->additionalRaderSharedSize * vecSize;
	}
	return res;
}
// Appends the declarations of the kernel's working variables to the generated code and
// records their names in sc. In emission order:
//   1. temp_<i> complex registers (plus temp_<i>_<j> copies when sc->convolutionStep is set),
//      including the additional sets used when sc->registerBoost > 1;
//   2. w, the loc_<i> registers, the Rader x0_<i> registers and, if needed, iw;
//   3. the integer index variables named by sc->stageInvocationID, blockInvocationID,
//      sdataID, combinedID and inoutID, then the optional disableThreads flag and Rader indices;
//   4. LUTId or angle, the optional mult register, and the optional shuffle array.
// Every emitted variable is also assigned zero.
//
// sc        - kernel layout. sc->regIDs is allocated here (registers_per_thread * registerBoost
//             strings of 50 bytes) and is NOT released by this function if a later append fails;
//             releasing it is presumably left to the owner of sc - confirm against the caller.
// floatType - "float" or "double"; selects the complex vector type name. Any other value
//             leaves the vector type name empty.
// uintType  - integer type name used for the index variables.
// initType  - values 1 and 2 force emission of the disableThreads flag.
//
// Returns VKFFT_SUCCESS, VKFFT_ERROR_MALLOC_FAILED, or the first error reported by VkAppendLine.
static inline VkFFTResult appendInitialization(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t initType) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Zero-initialized so an unrecognized floatType produces an empty name rather than indeterminate bytes.
	char vecType[30] = "";
	char uintType_32[30] = "";
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
	sprintf(uintType_32, "uint");
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	sprintf(uintType_32, "unsigned int");
#endif
	const uint64_t logicalStoragePerThread = sc->registers_per_thread * sc->registerBoost;

	// Base register set temp_0 .. temp_<registers_per_thread - 1>.
	for (uint64_t i = 0; i < sc->registers_per_thread; i++) {
		sc->tempLen = sprintf(sc->tempStr, "	%s temp_%" PRIu64 ";\n", vecType, i);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "	temp_%" PRIu64 ".x=0;\n", i);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "	temp_%" PRIu64 ".y=0;\n", i);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	// Convolution kernels get one extra register set per additional matrix index j >= 1.
	if (sc->convolutionStep) {
		for (uint64_t j = 1; j < sc->matrixConvolution; j++) {
			for (uint64_t i = 0; i < sc->registers_per_thread; i++) {
				sc->tempLen = sprintf(sc->tempStr, "	%s temp_%" PRIu64 "_%" PRIu64 ";\n", vecType, i, j);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	temp_%" PRIu64 "_%" PRIu64 ".x=0;\n", i, j);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "	temp_%" PRIu64 "_%" PRIu64 ".y=0;\n", i, j);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
	}

	// Name table for all logical registers: regIDs[i] == "temp_<i>".
	sc->regIDs = (char**)malloc(sizeof(char*) * logicalStoragePerThread);
	if (!sc->regIDs) return VKFFT_ERROR_MALLOC_FAILED;
	for (uint64_t i = 0; i < logicalStoragePerThread; i++) {
		sc->regIDs[i] = (char*)malloc(sizeof(char) * 50);
		if (!sc->regIDs[i]) {
			// Roll back the entries allocated so far before reporting the failure.
			for (uint64_t j = 0; j < i; j++) {
				free(sc->regIDs[j]);
				sc->regIDs[j] = 0;
			}
			free(sc->regIDs);
			sc->regIDs = 0;
			return VKFFT_ERROR_MALLOC_FAILED;
		}
		sprintf(sc->regIDs[i], "temp_%" PRIu64 "", i);
	}

	// Additional register sets for register boost; the loop is empty when registerBoost <= 1.
	for (uint64_t i = 1; i < sc->registerBoost; i++) {
		for (uint64_t j = 0; j < sc->registers_per_thread; j++) {
			const uint64_t regIndex = j + i * sc->registers_per_thread;
			sc->tempLen = sprintf(sc->tempStr, "	%s temp_%" PRIu64 ";\n", vecType, regIndex);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	temp_%" PRIu64 ".x=0;\n", regIndex);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	temp_%" PRIu64 ".y=0;\n", regIndex);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
	}

	sc->tempLen = sprintf(sc->tempStr, "	%s w;\n", vecType);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "	w.x=0;\n");
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "	w.y=0;\n");
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sprintf(sc->w, "w");

	uint64_t maxNonPow2Radix = sc->maxNonPow2Radix;
	for (uint64_t i = 0; i < sc->usedLocRegs; i++) {
		sprintf(sc->locID[i], "loc_%" PRIu64 "", i);
		sc->tempLen = sprintf(sc->tempStr, "	%s %s;\n", vecType, sc->locID[i]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "	%s.x=0;\n", sc->locID[i]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "	%s.y=0;\n", sc->locID[i]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	// The scratch register is an alias of the first loc register.
	sprintf(sc->temp, "%s", sc->locID[0]);

	if (sc->useRaderFFT) {
		for (uint64_t i = 0; i < 2; i++) {
			sprintf(sc->x0[i], "x0_%" PRIu64 "", i);
			sc->tempLen = sprintf(sc->tempStr, "	%s %s;\n", vecType, sc->x0[i]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.x=0;\n", sc->x0[i]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y=0;\n", sc->x0[i]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
	}
	if (sc->useRaderMult) {
		int64_t rader_fft_regs = (sc->useRaderFFT) ? 2 : 0;
		int64_t rader_mult_regs = sc->raderRegisters / 2 - rader_fft_regs;
		// loc_1 .. loc_<usedLocRegs - 1> are reused as x0 registers first; only the remainder is declared.
		const int64_t reusableLocRegs = (int64_t)sc->usedLocRegs - 1;
		const int64_t reusedLocRegs = (rader_mult_regs <= reusableLocRegs) ? rader_mult_regs : reusableLocRegs;
		for (int64_t i = 0; i < reusedLocRegs; i++) {
			sprintf(sc->x0[i + rader_fft_regs], "%s", sc->locID[i + 1]);
		}
		for (int64_t i = reusableLocRegs; i < rader_mult_regs; i++) {
			// Cast: PRIu64 requires an unsigned 64-bit argument, the index is signed.
			sprintf(sc->x0[i + rader_fft_regs], "x0_%" PRIu64 "", (uint64_t)(i + rader_fft_regs));
			sc->tempLen = sprintf(sc->tempStr, "	%s %s;\n", vecType, sc->x0[i + rader_fft_regs]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.x=0;\n", sc->x0[i + rader_fft_regs]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	%s.y=0;\n", sc->x0[i + rader_fft_regs]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
	}

	// iw is needed when any stage has radix 8, 16 or 32, or (with at least one stage) when Rader FFT is used.
	uint64_t useRadix8plus = 0;
	for (uint64_t i = 0; i < sc->numStages; i++)
		if ((sc->stageRadix[i] == 8) || (sc->stageRadix[i] == 16) || (sc->stageRadix[i] == 32) || (sc->useRaderFFT)) useRadix8plus = 1;
	if (useRadix8plus == 1) {
		// With a non-power-of-two radix present, loc_1 doubles as iw; otherwise a dedicated variable is declared.
		if (maxNonPow2Radix > 1) sprintf(sc->iw, "%s", sc->locID[1]);
		else {
			sc->tempLen = sprintf(sc->tempStr, "	%s iw;\n", vecType);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	iw.x=0;\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	iw.y=0;\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sprintf(sc->iw, "iw");
		}
	}

	// Integer index variables, all starting at zero.
	const char* indexNames[5] = { sc->stageInvocationID, sc->blockInvocationID, sc->sdataID, sc->combinedID, sc->inoutID };
	for (uint64_t i = 0; i < 5; i++) {
		sc->tempLen = sprintf(sc->tempStr, "	%s %s=0;\n", uintType, indexNames[i]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if ((sc->fftDim < sc->fft_dim_full) || (initType == 1) || (initType == 2)) {
		sc->tempLen = sprintf(sc->tempStr, "	%s disableThreads=1;\n", uintType_32);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (sc->useRader) {
		sc->tempLen = sprintf(sc->tempStr, "	%s %s = 0;\n", uintType, sc->raderIDx);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "	%s %s = 0;\n", uintType, sc->raderIDx2);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (sc->LUT) {
		sc->tempLen = sprintf(sc->tempStr, "	%s LUTId=0;\n", uintType);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	else {
		sc->tempLen = sprintf(sc->tempStr, "	%s angle=0;\n", floatType);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	// Equivalent to the former three-way test: with A = (stageStartSize > 1) and
	// Y = (A && !reorderFourStep && inverse), the expression (A && !Y) || Y reduces to A.
	if ((sc->stageStartSize > 1) || (sc->performDCT)) {
		sc->tempLen = sprintf(sc->tempStr, "	%s mult;\n", vecType);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "	mult.x = 0;\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "	mult.y = 0;\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (sc->cacheShuffle) {
		sc->tempLen = sprintf(sc->tempStr, "\
	%s tshuffle= ((%s>>1))%%(%" PRIu64 ");\n\
	%s shuffle[%" PRIu64 "];\n", uintType, sc->gl_LocalInvocationID_x, sc->registers_per_thread, vecType, sc->registers_per_thread);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		for (uint64_t i = 0; i < sc->registers_per_thread; i++) {
			sc->tempLen = sprintf(sc->tempStr, "	shuffle[%" PRIu64 "].x = 0;\n", i);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "	shuffle[%" PRIu64 "].y = 0;\n", i);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
	}
	return res;
}
// Helper of appendZeropadStart: formats one opening guard line
//   if(!((id >= left)&&(id < right))) {
// for the given axis into sc->tempStr and passes it to VkAppendLine.
// Nothing is emitted (and VKFFT_SUCCESS is returned) unless sc->performZeropaddingFull[axis] is set
// and sc->fft_zeropad_left_full[axis] < sc->fft_zeropad_right_full[axis].
static inline VkFFTResult appendZeropadStartGuardLine(VkFFTSpecializationConstantsLayout* sc, const char* id, uint64_t axis) {
	if (!sc->performZeropaddingFull[axis]) {
		return VKFFT_SUCCESS;
	}
	if (!(sc->fft_zeropad_left_full[axis] < sc->fft_zeropad_right_full[axis])) {
		return VKFFT_SUCCESS;
	}
	sc->tempLen = sprintf(sc->tempStr, "		if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", id, sc->fft_zeropad_left_full[axis], id, sc->fft_zeropad_right_full[axis]);
	return VkAppendLine(sc);
}
// Emits the opening "if(!(left <= index < right)) {" guards for the axes other than the one
// selected by sc->axis_id. The index expressions are built from the invocation/workgroup ID strings
// stored in sc (plus the consts.workGroupShift* terms when sc->performWorkGroupShift is set).
// Which axes get a guard depends on sc->frequencyZeropadding, sc->axis_id and sc->supportAxis;
// every guard opened here has a matching "}" condition in appendZeropadEnd.
// Returns the first non-success code reported by VkAppendLine, VKFFT_SUCCESS otherwise.
static inline VkFFTResult appendZeropadStart(VkFFTSpecializationConstantsLayout* sc) {
	VkFFTResult res = VKFFT_SUCCESS;
	char idX[500] = "";
	char idY[500] = "";
	char idZ[500] = "";

	if (sc->frequencyZeropadding) {
		if (sc->axis_id == 1) {
			// support axes emit no guard here
			if (sc->supportAxis) {
				return res;
			}
			if (sc->performWorkGroupShift[0]) {
				sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
			}
			else {
				sprintf(idX, "%s", sc->gl_GlobalInvocationID_x);
			}
			return appendZeropadStartGuardLine(sc, idX, 0);
		}
		if (sc->axis_id == 2) {
			if (sc->supportAxis) {
				//for support axes y is along x workgroup
				if (sc->performWorkGroupShift[1]) {
					sprintf(idY, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
				}
				else {
					sprintf(idY, "%s", sc->gl_GlobalInvocationID_x);
				}
				return appendZeropadStartGuardLine(sc, idY, 1);
			}
			//y axis is along z workgroup here
			if (sc->performWorkGroupShift[1]) {
				sprintf(idY, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z);
			}
			else {
				sprintf(idY, "%s", sc->gl_GlobalInvocationID_z);
			}
			if (sc->performWorkGroupShift[0]) {
				sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x);
			}
			else {
				sprintf(idX, "%s", sc->gl_GlobalInvocationID_x);
			}
			// x guard first, then y guard
			res = appendZeropadStartGuardLine(sc, idX, 0);
			if (res != VKFFT_SUCCESS) {
				return res;
			}
			return appendZeropadStartGuardLine(sc, idY, 1);
		}
		return res;
	}

	// without frequency zeropadding only axes 0 and 1 emit guards
	if ((sc->axis_id != 0) && (sc->axis_id != 1)) {
		return res;
	}

	if (sc->axis_id == 0) {
		// y index is scaled by two when sequences are merged for R2C
		const uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		if (sc->axisSwapped) {
			if (sc->performWorkGroupShift[1]) {
				if (mult != 1) {
					sprintf(idY, "((%s + (%s + consts.workGroupShiftY) * %" PRIu64 ")* %" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0], mult);
				}
				else {
					sprintf(idY, "(%s + (%s + consts.workGroupShiftY) * %" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0]);
				}
			}
			else {
				if (mult != 1) {
					sprintf(idY, "((%s + %s * %" PRIu64 ")*%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0], mult);
				}
				else {
					sprintf(idY, "(%s + %s * %" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0]);
				}
			}
		}
		else {
			if (sc->performWorkGroupShift[1]) {
				if (mult != 1) {
					sprintf(idY, "((%s + consts.workGroupShiftY * %s)* %" PRIu64 ")", sc->gl_GlobalInvocationID_y, sc->gl_WorkGroupSize_y, mult);
				}
				else {
					sprintf(idY, "(%s + consts.workGroupShiftY * %s)", sc->gl_GlobalInvocationID_y, sc->gl_WorkGroupSize_y);
				}
			}
			else {
				if (mult != 1) {
					sprintf(idY, "(%s* %" PRIu64 ")", sc->gl_GlobalInvocationID_y, mult);
				}
				else {
					sprintf(idY, "%s", sc->gl_GlobalInvocationID_y);
				}
			}
		}
	}

	// z index (taken modulo sc->size[2]) is shared by axis 0 and axis 1
	if (sc->performWorkGroupShift[2]) {
		sprintf(idZ, "(%s + consts.workGroupShiftZ * %s) %%  %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->size[2]);
	}
	else {
		sprintf(idZ, "%s %%  %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->size[2]);
	}

	if (sc->axis_id == 0) {
		res = appendZeropadStartGuardLine(sc, idY, 1);
		if (res != VKFFT_SUCCESS) {
			return res;
		}
	}
	return appendZeropadStartGuardLine(sc, idZ, 2);
}
// Emits the closing "}" lines that match the guards opened by appendZeropadStart.
// One "}" is appended for every axis that (a) is selected below from sc->frequencyZeropadding,
// sc->axis_id and sc->supportAxis, (b) has sc->performZeropaddingFull[axis] set and
// (c) satisfies sc->fft_zeropad_left_full[axis] < sc->fft_zeropad_right_full[axis].
// The index strings formatted by the opener are not needed to close a brace, so none are built here.
// Returns the first non-success code reported by VkAppendLine, VKFFT_SUCCESS otherwise.
static inline VkFFTResult appendZeropadEnd(VkFFTSpecializationConstantsLayout* sc) {
	VkFFTResult res = VKFFT_SUCCESS;
	// closesAxis[a] != 0 -> a guard on axis a may have been opened for this configuration
	int closesAxis[3] = { 0, 0, 0 };
	if (sc->frequencyZeropadding) {
		switch (sc->axis_id) {
		case 1:
			// x guard exists only for non-support axes
			closesAxis[0] = !sc->supportAxis;
			break;
		case 2:
			// y guard always, x guard only for non-support axes
			closesAxis[0] = !sc->supportAxis;
			closesAxis[1] = 1;
			break;
		default:
			break;
		}
	}
	else {
		switch (sc->axis_id) {
		case 0:
			closesAxis[1] = 1;
			closesAxis[2] = 1;
			break;
		case 1:
			closesAxis[2] = 1;
			break;
		default:
			break;
		}
	}
	for (int axis = 0; axis < 3; axis++) {
		if (!closesAxis[axis]) continue;
		if (!sc->performZeropaddingFull[axis]) continue;
		if (!(sc->fft_zeropad_left_full[axis] < sc->fft_zeropad_right_full[axis])) continue;
		sc->tempLen = sprintf(sc->tempStr, "		}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	return res;
}

// Helper of appendZeropadStartReadWriteStage: formats one opening guard line
//   if(!((id >= left)&&(id < right))) {
// for the given axis into sc->tempStr and passes it to VkAppendLine.
// Nothing is emitted (and VKFFT_SUCCESS is returned) unless sc->performZeropaddingFull[axis] is set
// and sc->fft_zeropad_left_full[axis] < sc->fft_zeropad_right_full[axis].
static inline VkFFTResult appendZeropadStartReadWriteStageGuardLine(VkFFTSpecializationConstantsLayout* sc, const char* id, uint64_t axis) {
	if (!sc->performZeropaddingFull[axis]) {
		return VKFFT_SUCCESS;
	}
	if (!(sc->fft_zeropad_left_full[axis] < sc->fft_zeropad_right_full[axis])) {
		return VKFFT_SUCCESS;
	}
	sc->tempLen = sprintf(sc->tempStr, "		if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", id, sc->fft_zeropad_left_full[axis], id, sc->fft_zeropad_right_full[axis]);
	return VkAppendLine(sc);
}
// Emits the opening "if(!(left <= index < right)) {" guards for a read/write stage.
// Unlike appendZeropadStart, the per-axis index expressions are derived from sc->inoutID using
// sc->inputStride (readStage != 0) or sc->outputStride (readStage == 0):
//   x: (inoutID % stride[1])
//   y: (inoutID/stride[1]) % (stride[2]/stride[1])
//   z: (inoutID/stride[2]) % (stride[3]/stride[2])
// The stride quotients are computed on the host while the line is formatted.
// Returns the first non-success code reported by VkAppendLine, VKFFT_SUCCESS otherwise.
static inline VkFFTResult appendZeropadStartReadWriteStage(VkFFTSpecializationConstantsLayout* sc, uint64_t readStage) {
	VkFFTResult res = VKFFT_SUCCESS;
	const uint64_t* stride = (readStage) ? sc->inputStride : sc->outputStride;
	char idX[500] = "";
	char idY[500] = "";
	char idZ[500] = "";

	if (sc->frequencyZeropadding) {
		if (sc->axis_id == 1) {
			// support axes emit no guard here
			if (sc->supportAxis) {
				return res;
			}
			sprintf(idX, "(%s %% %" PRIu64 ")", sc->inoutID, stride[1]);
			return appendZeropadStartReadWriteStageGuardLine(sc, idX, 0);
		}
		if (sc->axis_id == 2) {
			sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, stride[1], stride[2] / stride[1]);
			if (!sc->supportAxis) {
				// x guard is only emitted for non-support axes and comes before the y guard
				sprintf(idX, "(%s %% %" PRIu64 ")", sc->inoutID, stride[1]);
				res = appendZeropadStartReadWriteStageGuardLine(sc, idX, 0);
				if (res != VKFFT_SUCCESS) {
					return res;
				}
			}
			return appendZeropadStartReadWriteStageGuardLine(sc, idY, 1);
		}
		return res;
	}

	if (sc->axis_id == 0) {
		sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, stride[1], stride[2] / stride[1]);
		sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, stride[2], stride[3] / stride[2]);
		res = appendZeropadStartReadWriteStageGuardLine(sc, idY, 1);
		if (res != VKFFT_SUCCESS) {
			return res;
		}
		return appendZeropadStartReadWriteStageGuardLine(sc, idZ, 2);
	}
	if (sc->axis_id == 1) {
		sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, stride[2], stride[3] / stride[2]);
		return appendZeropadStartReadWriteStageGuardLine(sc, idZ, 2);
	}
	return res;
}
// Emits the closing "}" lines that match the guards opened by appendZeropadStartReadWriteStage.
// One "}" is appended for every axis that (a) is selected below from sc->frequencyZeropadding,
// sc->axis_id and sc->supportAxis, (b) has sc->performZeropaddingFull[axis] set and
// (c) satisfies sc->fft_zeropad_left_full[axis] < sc->fft_zeropad_right_full[axis].
// With frequency zeropadding, appendZeropadStartReadWriteStage opens the x guard only when
// sc->supportAxis is not set, so the x guard is closed under that same condition; closing it
// unconditionally would leave an unmatched "}" in the generated code for support axes.
// Returns the first non-success code reported by VkAppendLine, VKFFT_SUCCESS otherwise.
static inline VkFFTResult appendZeropadEndReadWriteStage(VkFFTSpecializationConstantsLayout* sc) {
	VkFFTResult res = VKFFT_SUCCESS;
	// closesAxis[a] != 0 -> a guard on axis a may have been opened for this configuration
	int closesAxis[3] = { 0, 0, 0 };
	if (sc->frequencyZeropadding) {
		switch (sc->axis_id) {
		case 1:
			// x guard exists only for non-support axes
			closesAxis[0] = !sc->supportAxis;
			break;
		case 2:
			// y guard always, x guard only for non-support axes
			closesAxis[0] = !sc->supportAxis;
			closesAxis[1] = 1;
			break;
		default:
			break;
		}
	}
	else {
		switch (sc->axis_id) {
		case 0:
			closesAxis[1] = 1;
			closesAxis[2] = 1;
			break;
		case 1:
			closesAxis[2] = 1;
			break;
		default:
			break;
		}
	}
	for (int axis = 0; axis < 3; axis++) {
		if (!closesAxis[axis]) continue;
		if (!sc->performZeropaddingFull[axis]) continue;
		if (!(sc->fft_zeropad_left_full[axis] < sc->fft_zeropad_right_full[axis])) continue;
		sc->tempLen = sprintf(sc->tempStr, "		}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	return res;
}
// Emits generated-code lines that write 0 to the .x and .y members of sdata entries.
// readType selects one of two addressing schemes:
//   0, 5, 6, 110, 120, 130, 140, 142, 144 - a flat "combinedID" is computed from the local
//       invocation IDs and split with %/ by sc->fftDim (index layout depends on sc->axisSwapped);
//   1, 2, 111, 121, 131, 141, 143, 145    - sdata is indexed as sharedStride*(local y + offset) + local x.
// Any other readType emits nothing. floatType, floatTypeMemory and uintType are not used here.
// Returns the first non-success code reported by VkAppendLine, VKFFT_SUCCESS otherwise.
static inline VkFFTResult appendSetSMToZero(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t readType) {
	VkFFTResult res = VKFFT_SUCCESS;
	// 0 - readType not handled, 1 - combinedID addressing, 2 - strided (local y) addressing
	int addressing = 0;
	switch (readType) {
	case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
		addressing = 1;
		break;
	case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145:
		addressing = 2;
		break;
	default:
		break;
	}
	if (addressing == 0) {
		return res;
	}

	// number of sdata entries each thread clears per register-boost pass
	uint64_t regsPerPass = (addressing == 1) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
	if (sc->registerBoost > 1) {
		regsPerPass /= sc->registerBoost;
	}

	for (uint64_t pass = 0; pass < sc->registerBoost; pass++) {
		for (uint64_t reg = 0; reg < regsPerPass; reg++) {
			const uint64_t slot = reg + pass * regsPerPass;

			if (addressing == 2) {
				// the last row block may reach past fftDim - restrict it by local y id
				const int partialRows = (sc->localSize[1] * (reg + 1) > sc->fftDim);
				if (partialRows) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * reg);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "		sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, slot * sc->localSize[1], sc->gl_LocalInvocationID_x);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, slot * sc->localSize[1], sc->gl_LocalInvocationID_x);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (partialRows) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				continue;
			}

			// combinedID addressing
			if (sc->localSize[1] == 1) {
				sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, slot * sc->localSize[0]);
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, slot * sc->localSize[0] * sc->localSize[1]);
			}
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			// total number of entries to clear; combinedID values at or above it must be skipped
			const uint64_t entryLimit = (sc->axisSwapped) ? (sc->fftDim * sc->localSize[0]) : (sc->fftDim * sc->localSize[1]);
			const int needsBound = ((1 + slot) * sc->localSize[0] * sc->localSize[1] > entryLimit);
			if (needsBound) {
				sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", entryLimit);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (sc->axisSwapped) {
				sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (needsBound) {
				sc->tempLen = sprintf(sc->tempStr, "		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
	}
	return res;
}
// Sets sc->readToRegisters to 1 or 0 depending on readType and the current layout in sc.
// For each handled readType the flag is cleared when any of the listed conditions holds and set
// otherwise; readType values that are not listed leave sc->readToRegisters untouched.
// Always returns VKFFT_SUCCESS.
static inline VkFFTResult setReadToRegisters(VkFFTSpecializationConstantsLayout* sc, uint64_t readType) {
	VkFFTResult res = VKFFT_SUCCESS;
	switch (readType) {
	case 0: //single_c2c
		sc->readToRegisters = ((sc->localSize[1] > 1) || ((sc->performR2C) && (sc->actualInverse)) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) ? 0 : 1;
		break;
	case 1: //grouped_c2c
	case 2: //single_c2c_strided
		sc->readToRegisters = ((sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) ? 0 : 1;
		break;
	case 5: //single_r2c
		sc->readToRegisters = ((sc->axisSwapped) || (sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) ? 0 : 1;
		break;
	case 6: //single_c2r
		sc->readToRegisters = ((sc->rader_generator[0] > 0) || ((sc->fftDim % sc->localSize[0]) && (!sc->axisSwapped)) || ((sc->fftDim % sc->localSize[1]) && (sc->axisSwapped))) ? 0 : 1;
		break;
	case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143:
		// these read types never read directly to registers
		sc->readToRegisters = 0;
		break;
	case 144:
	case 145:
	{
		// register count of the first stage, or 1 when its radix is not below fixMinRaderPrimeMult
		uint64_t firstStageRegisters = 1;
		if (sc->stageRadix[0] < sc->fixMinRaderPrimeMult) {
			firstStageRegisters = sc->registers_per_thread_per_radix[sc->stageRadix[0]];
		}
		sc->readToRegisters = ((sc->rader_generator[0] > 0) || (sc->fftDim % firstStageRegisters)) ? 0 : 1;
		break;
	}
	default:
		break;
	}
	return res;
}
static inline VkFFTResult appendReadDataVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t readType) {
	VkFFTResult res = VKFFT_SUCCESS;
	long double double_PI = 3.14159265358979323846264338327950288419716939937510L;
	char vecType[30];
	char inputsStruct[20] = "";
	char LFending[4] = "";
	char uintType_32[30];
	if (!strcmp(floatType, "float")) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (sc->inputBufferBlockNum == 1)
		sprintf(inputsStruct, "inputs");
	else
		sprintf(inputsStruct, ".inputs");
	if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
	if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
	sprintf(uintType_32, "uint");
	char cosDef[20] = "cos";
	char sinDef[20] = "sin";
#elif(VKFFT_BACKEND==1)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
	sprintf(uintType_32, "unsigned int");
	sprintf(inputsStruct, "inputs");
	char cosDef[20] = "__cosf";
	char sinDef[20] = "__sinf";
#elif(VKFFT_BACKEND==2)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
	sprintf(uintType_32, "unsigned int");
	sprintf(inputsStruct, "inputs");
	char cosDef[20] = "__cosf";
	char sinDef[20] = "__sinf";
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	sprintf(inputsStruct, "inputs");
	sprintf(uintType_32, "unsigned int");
	char cosDef[20] = "native_cos";
	char sinDef[20] = "native_sin";
#endif
	char convTypeLeft[20] = "";
	char convTypeRight[20] = "";
	if ((!strcmp(floatType, "float")) && (strcmp(floatTypeMemory, "float"))) {
		if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) {
#if(VKFFT_BACKEND==0)
			sprintf(convTypeLeft, "float(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==1)
			sprintf(convTypeLeft, "(float)");
			//sprintf(convTypeRight, "");
#elif(VKFFT_BACKEND==2)
			sprintf(convTypeLeft, "(float)");
			//sprintf(convTypeRight, "");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			sprintf(convTypeLeft, "(float)");
			//sprintf(convTypeRight, "");
#endif
		}
		else {
#if(VKFFT_BACKEND==0)
			sprintf(convTypeLeft, "vec2(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==1)
			sprintf(convTypeLeft, "conv_float2(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==2)
			sprintf(convTypeLeft, "conv_float2(");
			sprintf(convTypeRight, ")");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			sprintf(convTypeLeft, "conv_float2(");
			sprintf(convTypeRight, ")");
#endif
		}
	}
	if ((!strcmp(floatType, "double")) && (strcmp(floatTypeMemory, "double"))) {
		if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) {
#if(VKFFT_BACKEND==0)
			sprintf(convTypeLeft, "double(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==1)
			sprintf(convTypeLeft, "(double)");
			//sprintf(convTypeRight, "");
#elif(VKFFT_BACKEND==2)
			sprintf(convTypeLeft, "(double)");
			//sprintf(convTypeRight, "");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			sprintf(convTypeLeft, "(double)");
			//sprintf(convTypeRight, "");
#endif
		}
		else {
#if(VKFFT_BACKEND==0)
			sprintf(convTypeLeft, "dvec2(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==1)
			sprintf(convTypeLeft, "conv_double2(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==2)
			sprintf(convTypeLeft, "conv_double2(");
			sprintf(convTypeRight, ")");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			sprintf(convTypeLeft, "conv_double2(");
			sprintf(convTypeRight, ")");
#endif
		}
	}
	char index_x[2000] = "";
	char index_y[2000] = "";
	char requestCoordinate[100] = "";
	if (sc->convolutionStep) {
		if (sc->matrixConvolution > 1) {
			sprintf(requestCoordinate, "coordinate");
		}
	}
	char requestBatch[100] = "";
	if (sc->convolutionStep) {
		if (sc->numKernels > 1) {
			sprintf(requestBatch, "0");//if one buffer - multiple kernel convolution
		}
	}
	//appendZeropadStart(sc);
	switch (readType) {
	case 0://single_c2c
	{
		//sc->tempLen = sprintf(sc->tempStr, "	return;\n");
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftY[500] = "";
		if (sc->axisSwapped) {
			if (sc->performWorkGroupShift[1])
				sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x);
		}
		else {
			if (sc->performWorkGroupShift[1])
				sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
		}
		char shiftY2[100] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
		if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
		if (sc->fftDim < sc->fft_dim_full) {
			if (sc->axisSwapped) {
				sc->tempLen = sprintf(sc->tempStr, "		%s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? %" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (uint64_t)ceil(((sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / (sc->firstStageStartSize / sc->fftDim)) / (double)used_registers_read), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim));
				//sc->tempLen = sprintf(sc->tempStr, "		if (numActiveThreads>%" PRIu64 ") numActiveThreads = %" PRIu64 ";\n", sc->localSize[0]* sc->localSize[1], sc->localSize[0]* sc->localSize[1]);
				//sprintf(sc->disableThreadsStart, "		if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(sc->disableThreadsStart, "		if(disableThreads>0) {\n");
				sc->tempLen = sprintf(sc->tempStr, "		if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(sc->disableThreadsEnd, "}");
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "		disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(sc->disableThreadsStart, "		if(disableThreads>0) {\n");
				res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(sc->disableThreadsEnd, "}");
			}
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "		{ \n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (sc->fftDim == sc->fft_dim_full) {
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->inputStride[0] > 1)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
#if (VKFFT_BACKEND!=2) //AMD compiler fix
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
#else
							sc->tempLen = sprintf(sc->tempStr, "		if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1], sc->inoutID);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
#endif
						}

						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
#if (VKFFT_BACKEND!=2) //AMD compiler fix
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
#else
							sc->tempLen = sprintf(sc->tempStr, "		if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1], sc->inoutID);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
#endif
						}
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->readToRegisters) {
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		%s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		%s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					}
					else {
						if (sc->axisSwapped) {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						}
						else {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						}
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->readToRegisters) {
							sc->tempLen = sprintf(sc->tempStr, "			%s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->axisSwapped) {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->readToRegisters) {
							sc->tempLen = sprintf(sc->tempStr, "			%s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->axisSwapped) {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->axisSwapped) {
						if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}

				}
			}
		}
		else {
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {
					/*
					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);

					sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
					*/
					if (sc->axisSwapped) {
						if ((sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / used_registers_read / (sc->firstStageStartSize / sc->fftDim) > sc->localSize[0]) {
							if (sc->localSize[1] == 1)
								sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
							else
								sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read));
						}
						else {
							if (sc->localSize[1] == 1)
								sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read));
							else
								sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read));
						}
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->readToRegisters) {
						//not used
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "			%s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "			%s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (sc->axisSwapped) {
							if (sc->fftDim % sc->localSize[1]) {
								sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if (sc->fftDim % sc->localSize[1]) {
								sc->tempLen = sprintf(sc->tempStr, "		}\n");
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						else {
							if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
								sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], convTypeLeft, inputsStruct, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
								sc->tempLen = sprintf(sc->tempStr, "		}\n");
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->readToRegisters) {
							sc->tempLen = sprintf(sc->tempStr, "			%s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->axisSwapped) {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[sharedStride*%s + (%s + %" PRIu64 ")].x = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[sharedStride*%s + (%s + %" PRIu64 ")].y = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
		}
		sc->tempLen = sprintf(sc->tempStr, "	}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		break;
	}
	case 1://grouped_c2c
	{
		// Emits the shader source that loads this thread's input elements for the
		// grouped c2c layout. Each element goes either into a register
		// (sc->readToRegisters) or into the sdata array of the generated shader.
		// Every line is formatted into sc->tempStr and pushed with VkAppendLine;
		// any non-success result is returned to the caller immediately.

		// Optional text pasted after every use of the global x invocation id;
		// stays empty unless sc->performWorkGroupShift[0] is set.
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
		// The generated flag is 1 when the thread's base index is below
		// sc->size[sc->axis_id] and 0 otherwise.
		sc->tempLen = sprintf(sc->tempStr, "		disableThreads = (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		// Open the block that wraps all reads below: it runs only when the flag is > 0.
		sprintf(sc->disableThreadsStart, "		if(disableThreads>0) {\n");

		res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
		if (res != VKFFT_SUCCESS) return res;
		// Only stores the closing-brace text in sc; nothing is appended here.
		sprintf(sc->disableThreadsEnd, "}");
		// Number of elements each thread reads: ceil(fftDim / localSize[1]),
		// divided across the register-boost passes when registerBoost > 1.
		uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
		if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
		// One generated read per (k, i) pair; (i + k * used_registers_read) is the
		// element slot handled by this iteration.
		for (uint64_t k = 0; k < sc->registerBoost; k++) {
			for (uint64_t i = 0; i < used_registers_read; i++) {
				// Position of this element along the FFT axis, built from the local y id,
				// the slot offset and the (shifted) global x id.
				sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				// Bluestein zero padding: read only while (inoutID % fft_dim_full) is below
				// the left bound; the matching else further down writes zeros.
				if (sc->zeropadBluestein[0]) {
					sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// User zero padding: read only when the position is outside [left, right);
				// the matching else further down writes zeros.
				if (sc->zeropad[0]) {
					sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// Start an assignment to sc->inoutID; the right-hand side is expected to be
				// appended by indexInputVkFFT (helper not visible here - confirm), then ";".
				sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				// x index handed to the helper: shifted global x id modulo fft_dim_x.
				sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x);
				res = indexInputVkFFT(sc, uintType, readType, index_x, sc->inoutID, requestCoordinate, requestBatch);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, ";\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = appendZeropadStartReadWriteStage(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				// When this slot reaches or passes the end of the FFT length, restrict the
				// read to local y ids that still fall inside fftDim.
				if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) {
					sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// The actual load. With several input blocks the index is split into
				// block number (/ inputBufferBlockSize) and offset (% inputBufferBlockSize).
				if (sc->readToRegisters) {
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "			%s=%s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "			%s=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					// Destination in sdata: sharedStride * (local y + slot offset) + local x.
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s]=%s%s[%s]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s]=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// Close the local-y guard; the condition must match the one that opened it.
				if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) {
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				res = appendZeropadEndReadWriteStage(sc);
				if (res != VKFFT_SUCCESS) return res;
				// Else branch of the user zero-padding test: store a zero value instead.
				if (sc->zeropad[0]) {
					sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->readToRegisters) {
						sc->tempLen = sprintf(sc->tempStr, "			%s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// Else branch of the Bluestein zero-padding test (opened first, so closed last).
				if (sc->zeropadBluestein[0]) {
					sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->readToRegisters) {
						sc->tempLen = sprintf(sc->tempStr, "			%s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		// NOTE(review): presumably closes the if(disableThreads>0) block opened above;
		// this relies on the zeropad stage helpers emitting balanced braces - confirm.
		sc->tempLen = sprintf(sc->tempStr, "	}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		break;
	}
	case 2://single_c2c_strided
	{
		// Emits the shader source that loads this thread's input elements for the
		// strided single-pass c2c layout. Each element goes either into a register
		// (sc->readToRegisters) or into the sdata array of the generated shader.
		// Every line is formatted into sc->tempStr and pushed with VkAppendLine;
		// any non-success result is returned to the caller immediately.

		// Optional text pasted after every use of the global x invocation id;
		// stays empty unless sc->performWorkGroupShift[0] is set.
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);

		// NOTE(review): the commented-out line below looks like an older early-return
		// variant of the range check; the active code uses the disableThreads flag instead.
		//sc->tempLen = sprintf(sc->tempStr, "		if(gl_GlobalInvolcationID.x%s >= %" PRIu64 ") return; \n", shiftX, sc->size[0] / axis->specializationConstants.fftDim);
		// The generated flag is 1 when the thread's base index is below
		// sc->fft_dim_full and 0 otherwise.
		sc->tempLen = sprintf(sc->tempStr, "		disableThreads = (((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim, sc->fft_dim_full);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		// Open the block that wraps all reads below: it runs only when the flag is > 0.
		sprintf(sc->disableThreadsStart, "		if(disableThreads>0) {\n");
		res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
		if (res != VKFFT_SUCCESS) return res;
		// Only stores the closing-brace text in sc; nothing is appended here.
		sprintf(sc->disableThreadsEnd, "}");
		// Number of elements each thread reads: ceil(fftDim / localSize[1]),
		// divided across the register-boost passes when registerBoost > 1.
		uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
		if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
		// One generated read per (k, i) pair; (i + k * used_registers_read) is the
		// element slot handled by this iteration.
		for (uint64_t k = 0; k < sc->registerBoost; k++) {
			for (uint64_t i = 0; i < used_registers_read; i++) {
				// Position of this element: (global x % stageStartSize)
				// + stageStartSize * (local y + slot offset)
				// + (global x / stageStartSize) * (stageStartSize * fftDim).
				sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				// Bluestein zero padding: read only while (inoutID % fft_dim_full) is below
				// the left bound; the matching else further down writes zeros.
				if (sc->zeropadBluestein[0]) {
					sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// User zero padding: read only when the position is outside [left, right);
				// the matching else further down writes zeros.
				if (sc->zeropad[0]) {
					sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// Start an assignment to sc->inoutID; the right-hand side is expected to be
				// appended by indexInputVkFFT (helper not visible here - confirm), then ";".
				sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, ";\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = appendZeropadStartReadWriteStage(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				// When this slot reaches or passes the end of the FFT length, restrict the
				// read to local y ids that still fall inside fftDim.
				if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) {
					sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// The actual load. With several input blocks the index is split into
				// block number (/ inputBufferBlockSize) and offset (% inputBufferBlockSize).
				if (sc->readToRegisters) {
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "			%s=%s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "			%s=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					// Destination in sdata: sharedStride * (local y + slot offset) + local x.
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s]=%s%s[%s]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s]=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// Close the local-y guard; the condition must match the one that opened it.
				if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) {
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				res = appendZeropadEndReadWriteStage(sc);
				if (res != VKFFT_SUCCESS) return res;
				// Else branch of the user zero-padding test: store a zero value instead.
				if (sc->zeropad[0]) {
					sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->readToRegisters) {
						sc->tempLen = sprintf(sc->tempStr, "			%s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// Else branch of the Bluestein zero-padding test (opened first, so closed last).
				if (sc->zeropadBluestein[0]) {
					sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->readToRegisters) {
						sc->tempLen = sprintf(sc->tempStr, "			%s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		// NOTE(review): presumably closes the if(disableThreads>0) block opened above;
		// this relies on the zeropad stage helpers emitting balanced braces - confirm.
		sc->tempLen = sprintf(sc->tempStr, "	}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		break;
	}
	case 5://single_r2c
	{
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		if (sc->fftDim == sc->fft_dim_full) {
			uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
			if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->inputStride[0] > 1)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->readToRegisters) {
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->mergeSequencesR2C) {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s%s[(%s + %" PRIu64 ")]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, sc->inputStride[1], convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = %sinputBlocks[(%s + %" PRIu64 ")/ %" PRIu64 "]%s[(%s + %" PRIu64 ") %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputStride[1], sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputStride[1], sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
							else
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (sc->axisSwapped) {

							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;

							if (sc->mergeSequencesR2C) {
								sc->tempLen = sprintf(sc->tempStr, "		inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								if (sc->inputBufferBlockNum == 1)
									sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight);
								else
									sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								if (sc->inputBufferBlockNum == 1)
									sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride+ (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
								else
									sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						else {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if (sc->mergeSequencesR2C) {
								sc->tempLen = sprintf(sc->tempStr, "		inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								if (sc->inputBufferBlockNum == 1)
									sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight);
								else
									sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								if (sc->inputBufferBlockNum == 1)
									sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
								else
									sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}

					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->readToRegisters) {
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->axisSwapped) {
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;

							}

						}
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->readToRegisters) {
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->axisSwapped) {
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;

							}

						}
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->axisSwapped) {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
		}
		else {
			//Not implemented
		}
		break;
	}
	case 6: {//single_c2r
		//sc->tempLen = sprintf(sc->tempStr, "	return;\n");
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
		// shiftY2 is the plain Y work-group shift that is added to the work-group ID
		// in the row bounds checks emitted further down in this case. The text must be
		// written into shiftY2 itself; writing it into shiftY left shiftY2 empty, so
		// those checks ignored the shift.
		char shiftY2[100] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY2, " + consts.workGroupShiftY ");
		if (sc->fftDim < sc->fft_dim_full) {
			//not implemented
			if (sc->axisSwapped)
				sc->tempLen = sprintf(sc->tempStr, "		disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full);
			else
				sc->tempLen = sprintf(sc->tempStr, "		disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sprintf(sc->disableThreadsStart, "		if(disableThreads>0) {\n");
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;
			sprintf(sc->disableThreadsEnd, "}");
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "		{ \n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
		if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		if (sc->fftDim == sc->fft_dim_full) {
			if (sc->zeropadBluestein[0]) {
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
				used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
			}
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[0]);
				//num_in =(uint64_t)ceil(num_in / (double)sc->min_registers_per_thread);
				for (uint64_t i = 0; i < num_in; i++) {
					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->inputStride[0] > 1)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 2 + 1, sc->inputStride[0], sc->fftDim / 2 + 1, sc->inputStride[1]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
#if (VKFFT_BACKEND!=2) //AMD compiler fix
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[0], sc->size[sc->axis_id + 1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
#else
							sc->tempLen = sprintf(sc->tempStr, "		if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[0], sc->size[sc->axis_id + 1], sc->inoutID);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
#endif
						}
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
#if (VKFFT_BACKEND!=2) //AMD compiler fix
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[1], sc->size[sc->axis_id + 1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
#else
							sc->tempLen = sprintf(sc->tempStr, "		if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[1], sc->size[sc->axis_id + 1], sc->inoutID);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
#endif
						}
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Emit "<inoutID> = " into the generated kernel; the right-hand side is
					// appended by indexInputVkFFT and the statement is closed with ";" below.
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					// Check the status before the next VkAppendLine overwrites res.
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (0) {
						//not enabled
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		%s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		%s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (!sc->axisSwapped) {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s%s[%s]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s%s[%s]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (0) {
							//not enabled
							sc->tempLen = sprintf(sc->tempStr, "			%s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (!sc->axisSwapped) {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1));
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1));
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1));
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1));
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->axisSwapped) {
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->axisSwapped) {
						if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}

				}
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				for (uint64_t i = 0; i < used_registers_read; i++) {
					if (sc->mergeSequencesR2C) {
						if (sc->axisSwapped) {
							if (i < ((sc->fftDim / 2 + 1) / sc->localSize[1])) {
								sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x - sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y + sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) {
									if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)))) {
										if (sc->zeropadBluestein[0]) {
											sc->tempLen = sprintf(sc->tempStr, "		if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1], sc->gl_LocalInvocationID_y);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
										}
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x + sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y + sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										if (sc->zeropadBluestein[0]) {
											sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		}\n");
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
										}
									}
									else {
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		}\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
								}
								else {
									if (sc->localSize[1] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, sc->fftDim);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, (sc->fftDim / 2 + 1) % sc->localSize[1]);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x - sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y + sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x + sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y + sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "		}\n");
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									if (sc->localSize[1] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		}else{;\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		}\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
								}
							}
						}
						else {
							if (i < ((sc->fftDim / 2 + 1) / sc->localSize[0])) {
								sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x - sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y + sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) {
									if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)))) {
										if (sc->zeropadBluestein[0]) {
											sc->tempLen = sprintf(sc->tempStr, "		if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0], sc->gl_LocalInvocationID_x);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
										}
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x + sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y + sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										if (sc->zeropadBluestein[0]) {
											sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		}\n");
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
										}
									}
									else {
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		}\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
								}
								else {
									if (sc->localSize[0] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, sc->fftDim);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, (sc->fftDim / 2 + 1) % sc->localSize[0]);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x - sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y + sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2));
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x + sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y + sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "		}\n");
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									if (sc->localSize[0] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		}else{;\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		}\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
								}
							}
						}
					}
					else {
						if (sc->axisSwapped) {
							if (i < ((sc->fftDim / 2 + 1) / sc->localSize[1])) {
								sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) {
									if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)))) {
										if (sc->zeropadBluestein[0]) {
											sc->tempLen = sprintf(sc->tempStr, "		if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1], sc->gl_LocalInvocationID_y);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
										}
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										if (sc->zeropadBluestein[0]) {
											sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		}\n");
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
										}
									}
									else {
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		}\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
								}
								else {
									if (sc->localSize[1] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, sc->fftDim);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, (sc->fftDim / 2 + 1) % sc->localSize[1]);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[1], sc->gl_LocalInvocationID_y);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[1], sc->gl_LocalInvocationID_y);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "		}\n");
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									if (sc->localSize[1] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		}else{;\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		}\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
								}
							}
						}
						else {
							if (i < ((sc->fftDim / 2 + 1) / sc->localSize[0])) {
								sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) {
									if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)))) {
										if (sc->zeropadBluestein[0]) {
											sc->tempLen = sprintf(sc->tempStr, "		if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0], sc->gl_LocalInvocationID_x);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
										}
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										if (sc->zeropadBluestein[0]) {
											sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
											sc->tempLen = sprintf(sc->tempStr, "		}\n");
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
										}
									}
									else {
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		}\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
								}
								else {
									if (sc->localSize[0] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, sc->fftDim);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, (sc->fftDim / 2 + 1) % sc->localSize[0]);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[0], sc->gl_LocalInvocationID_x);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "			%s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[0], sc->gl_LocalInvocationID_x);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									sc->tempLen = sprintf(sc->tempStr, "		}\n");
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
									if (sc->localSize[0] > sc->fftDim) {
										sc->tempLen = sprintf(sc->tempStr, "		}else{;\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
										sc->tempLen = sprintf(sc->tempStr, "		}\n");
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
								}
							}
						}

					}
				}
			}
			//sc->readToRegisters = 1;
			if (sc->zeropadBluestein[0]) {
				sc->fftDim = sc->fft_dim_full;
				used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
			}
			if (!sc->readToRegisters) {
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				for (uint64_t k = 0; k < sc->registerBoost; k++) {
					for (uint64_t i = 0; i < used_registers_read; i++) {
						if (sc->axisSwapped) {
							if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) {
								sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							sc->tempLen = sprintf(sc->tempStr, "			sdata[(%s+%" PRIu64 ") * sharedStride + %s].x = %s.x;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "			sdata[(%s+%" PRIu64 ") * sharedStride + %s].y = %s.y;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) {
								sc->tempLen = sprintf(sc->tempStr, "		}\n");
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						else {
							if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) {
								sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							sc->tempLen = sprintf(sc->tempStr, "			sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].x = %s.x;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "			sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].y = %s.y;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) {
								sc->tempLen = sprintf(sc->tempStr, "		}\n");
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
					}
				}
			}
		}
		else {

		}
		sc->tempLen = sprintf(sc->tempStr, "	}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		break;
	}
	case 110://DCT-I nonstrided
	{
		// DCT-I read stage, non-strided layout.
		// Emits code that, for each element handled by the workgroup, reads one real
		// input sample into sdata[sdataID].x (and a second sequence into .y when
		// mergeSequencesR2C is set, otherwise .y = 0) and then copies every interior
		// sample j (0 < j < n-1) to the mirrored position 2*n-2-j, where n is the
		// temporarily reduced sc->fftDim computed below.
		// Only the sc->fftDim == sc->fft_dim_full path is implemented.
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		if (sc->fftDim == sc->fft_dim_full) {
			if (sc->zeropadBluestein[0]) {
				res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
			}
			// sc->fftDim is temporarily replaced by the number of samples actually read
			// per sequence; it is restored (2 * n - 2, or fft_dim_full) after the loops.
			sc->fftDim = (sc->fftDim + 2) / 2;
			// localSize entry that advances along a sequence / across sequences; the two
			// swap roles when axisSwapped is set.
			uint64_t local_seq = (sc->axisSwapped) ? sc->localSize[1] : sc->localSize[0];
			uint64_t local_batch = (sc->axisSwapped) ? sc->localSize[0] : sc->localSize[1];
			uint64_t num_in = (uint64_t)ceil((sc->fftDim) / (double)local_seq);
			// number of rows along size[1] after pairing sequences (loop-invariant)
			uint64_t num_batches = (uint64_t)ceil(sc->size[1] / (double)mult);
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < num_in; i++) {
					// guard against rows past the end of size[1]
					int guard_batch = (num_batches % local_batch != 0);
					// guard for iterations that may reach past the end of the sequence
					int guard_tail = ((1 + i + k * num_in) * local_seq >= (sc->fftDim));

					// flat element id of this thread for the current iteration
					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// position inside the sequence (combinedID % n) and sequence number
					// (combinedID / n) mapped through the input strides
					if (sc->inputStride[0] > 1)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					// Open the guards. They are closed in reverse order at the bottom of
					// the loop body, under exactly the same conditions.
					if (guard_batch) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, local_batch, num_batches);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (guard_tail) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * local_batch);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * local_batch);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						// memory is only read outside [left, right); the matching else
						// branch emitted further down writes zeros instead
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// The result must be stored in res: previously it was dropped and the
					// check below re-tested the outcome of the preceding VkAppendLine.
					res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;

					// destination index in sdata; the layout is transposed when axisSwapped
					if (sc->axisSwapped)
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
					else
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ")  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					// first sequence -> .x (the swapped path spells the index through
					// sc->inoutID, the other path uses the literal name, as before)
					if (sc->axisSwapped) {
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					}
					else {
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->mergeSequencesR2C) {
						// second sequence, one inputStride[1] further, -> .y
						sc->tempLen = sprintf(sc->tempStr, "		inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					// mirror interior samples: j -> 2*n-2-j for 0 < j < n-1
					sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[inoutID] = sdata[sdataID];\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						// else branch of the zeropad test: store zeros and mirror them
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->axisSwapped)
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
						else
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ")  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->axisSwapped)
							sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim);
						else
							sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[inoutID] = sdata[sdataID];\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					// close the guards in reverse order of opening
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (guard_tail) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (guard_batch) {
						sc->tempLen = sprintf(sc->tempStr, "		}");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			// restore the FFT length that was replaced before the loops
			sc->fftDim = 2 * sc->fftDim - 2;
			if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
		}
		else {
			//Not implemented
		}
		break;
	}
	case 111://DCT-I strided
	{
		// DCT-I read stage, strided layout.
		// Emits code in which the position inside a sequence is derived from
		// gl_LocalInvocationID_y (combinedID) and the column inside sdata from
		// gl_LocalInvocationID_x. Each sample is read into regIDs[0].x, stored to
		// sdata[sdataID] and, for interior positions j (0 < j < n-1), copied to the
		// mirrored row 2*n-2-j, where n is the temporarily reduced sc->fftDim.
		// Only the sc->fftDim == sc->fft_dim_full path is implemented.
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftX2[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
		// NOTE(review): shiftY is filled in but not referenced anywhere else in this case.
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		if (sc->fftDim == sc->fft_dim_full) {
			if (sc->zeropadBluestein[0]) {
				res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
			}
			// sc->fftDim is temporarily replaced by the number of samples read per
			// sequence; it is restored (2 * n - 2, or fft_dim_full) after the loops.
			sc->fftDim = (sc->fftDim + 2) / 2;
			// iterations needed for localSize[1] threads to cover one sequence
			uint64_t num_in = (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[1]);
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < num_in; i++) {

					//sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
					//res = VkAppendLine(sc);
					//if (res != VKFFT_SUCCESS) return res;

					// Guard 1 (closed last): skip threads whose global x index lies past
					// the number of columns; only emitted when that count is not a
					// multiple of localSize[0].
					if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// position inside the sequence handled by this thread in this iteration
					if (sc->mergeSequencesR2C)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1], mult);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Guard 2: Bluestein zero-padding limit
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Guard 3: iterations that may reach past the end of the sequence
					if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", (sc->fftDim));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// NOTE(review): in the mergeSequencesR2C path the emitted line starts
					// with "//", i.e. the sdataID assignment is a comment in the generated
					// shader, while sdata[sdataID] is still written below - confirm intended.
					if (sc->mergeSequencesR2C)
						sc->tempLen = sprintf(sc->tempStr, "		//sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], mult, sc->localSize[0], mult);
					else
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// "<inoutID> = " followed by the index expression appended by
					// indexInputVkFFT from the index_x / index_y strings built here
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->mergeSequencesR2C) {
						sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x);

						sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]);
					}
					else {
						sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
						sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]);
					}
					res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// NOTE(review): here the zeropad `if` is opened after
					// appendZeropadStartReadWriteStage, yet appendZeropadEndReadWriteStage
					// is emitted before its "}else{". If those helpers emit a brace pair,
					// the two blocks interleave in the generated code - confirm against
					// the helpers (case 110 opens the zeropad `if` before the start helper).
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						// memory is only read outside [left, right); zeros are written in
						// the else branch emitted further down
						sc->tempLen = sprintf(sc->tempStr, "		if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// load the sample into regIDs[0].x (single buffer or split input blocks)
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->mergeSequencesR2C) {
						// threads with an even local x id fill .x, the others fill .y
						sc->tempLen = sprintf(sc->tempStr, "		if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		} else {\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.x;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// mirror interior samples: row j -> row 2*n-2-j for 0 < j < n-1
					sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + %s;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[inoutID] = sdata[sdataID];\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						// else branch of the zeropad test: store zeros and mirror them
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->mergeSequencesR2C) {
							sc->tempLen = sprintf(sc->tempStr, "		if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		} else {\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + %s;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[inoutID] = sdata[sdataID];\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// close guard 3 (end of sequence)
					if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// close guard 2 (Bluestein limit)
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// close guard 1 (global x bound)
					if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			// restore the FFT length that was replaced before the loops
			sc->fftDim = 2 * sc->fftDim - 2;
			if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
		}
		else {
			//Not implemented
		}
		break;
	}
	case 120://DCT-II nonstrided
	{
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
		if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
		if (sc->fftDim == sc->fft_dim_full) {
			if (sc->zeropadBluestein[0]) {
				res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
				used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
			}
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->inputStride[0] > 1)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (sc->zeropadBluestein[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (sc->zeropadBluestein[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->mergeSequencesR2C) {
							sc->tempLen = sprintf(sc->tempStr, "		inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;

							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2))  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->mergeSequencesR2C) {
							sc->tempLen = sprintf(sc->tempStr, "		inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->axisSwapped) {
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2))  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}

						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->axisSwapped) {
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			if (sc->zeropadBluestein[0]) {
				sc->fftDim = sc->fft_dim_full;
				used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
			}
		}
		else {
			//Not implemented
		}
		break;
	}
	case 121://DCT-II strided
	{
		// Emits the shader text for this read variant. Each unrolled iteration (k, i)
		// generates code that computes combinedID, sdataID and inoutID, loads one input
		// element into regIDs[0] and stores it into sdata (zero is stored instead where
		// the zeropad test below selects the else branch).
		// Optional work-group shift suffixes that are spliced into the generated index expressions.
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftX2[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
		// NOTE(review): shiftY is filled in here but is not referenced anywhere else in this case.
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		// mult is 2 when two sequences are merged (mergeSequencesR2C), otherwise 1.
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		// Iterations needed to cover fftDim with localSize[1] rows, divided across registerBoost passes.
		uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
		if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
		// Only fftDim == fft_dim_full is handled; the else branch emits nothing.
		if (sc->fftDim == sc->fft_dim_full) {
			if (sc->zeropadBluestein[0]) {
				// Emit appendSetSMToZero and a barrier, then temporarily replace fftDim with
				// fft_zeropad_Bluestein_left_read[axis_id]; fftDim is restored after the loops.
				res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
				// Note: unlike the initial computation, this value is not divided by registerBoost.
				used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
			}
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < mult * used_registers_read; i++) {

					//sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					//res = VkAppendLine(sc);
					//if (res != VKFFT_SUCCESS) return res;

					// Guard #1 (closed at the very end of the iteration with the same condition):
					// emitted only when ceil(size[0]/mult) is not a multiple of localSize[0].
					if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// combinedID = local Y id plus this iteration's offset (divided by mult when merging).
					if (sc->mergeSequencesR2C)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], mult);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Guard #2 (closed below with the same condition): bounds combinedID by fftDim
					// for iterations that can run past the end of the sequence.
					if ((1 + i + mult * k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Guard #3 (Bluestein zero padding): only positions below the left-read length are loaded.
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Shared-memory row: with c = combinedID % fftDim, even c maps to c/2 and
					// odd c maps to (fftDim - 1) - c/2; the row is multiplied by sharedStride.
					// The column is the local X id (folded with the Y id and divided by mult when merging).
					if (sc->mergeSequencesR2C)
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], mult, sc->localSize[0], mult);
					else
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + %s;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Emit "<inoutID> = <index expression>;" in three pieces: the left-hand side,
					// the expression appended by indexInputVkFFT from index_x/index_y, and the ";".
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->mergeSequencesR2C) {
						sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x);

						sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * used_registers_read) * sc->localSize[1]);
					}
					else {
						sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
						sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]);
					}
					res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Paired with appendZeropadEndReadWriteStage further down.
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					// Zeropad test on index_y: the load happens when the position is below the left
					// bound or at/after the right bound; the matching else branch below stores zeros.
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Load one input element into regIDs[0].x; with several input blocks the
					// block number and in-block offset are derived from inputBufferBlockSize.
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->mergeSequencesR2C) {
						// Merged mode: threads with an even local X id write the .x component,
						// the others write the .y component of the same sdata element.
						sc->tempLen = sprintf(sc->tempStr, "		if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		} else {\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.x;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						// Single sequence: value goes to .x, .y is set to zero.
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Else branch of the zeropad test: store zeros with the same .x/.y split as above.
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->mergeSequencesR2C) {
							sc->tempLen = sprintf(sc->tempStr, "		if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		} else {\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Close guard #3 (Bluestein), then guard #2 (combinedID bound), then guard #1
					// (global X bound) - the reverse of the order in which they were opened.
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + mult * k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			// Undo the temporary fftDim override made for Bluestein zero padding.
			if (sc->zeropadBluestein[0]) {
				sc->fftDim = sc->fft_dim_full;
				used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
			}
		}
		else {
			//Not implemented
		}
		break;
	}
	case 130://DCT-III nonstrided
	{
		// Emits the shader text for this read variant. With c = combinedID % (fftDim/2 + 1),
		// each unrolled iteration (k, i) generates code that loads the input elements at
		// positions c (into regIDs[0]) and fftDim - c (into regIDs[1]), obtains the factor
		// `mult` (from twiddleLUT or computed with cos/sin), and stores the combined values
		// into sdata at positions c and fftDim - c. For c == 0 only regIDs[0] is used.
		// NOTE(review): shiftX and shiftY are filled in but are not referenced anywhere else in this case.
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		// mult is 2 when two sequences are merged (mergeSequencesR2C), otherwise 1.
		// (The generated shader also uses a variable named `mult`; that one is the twiddle factor.)
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		// Only fftDim == fft_dim_full is handled; the else branch emits nothing.
		if (sc->fftDim == sc->fft_dim_full) {
			if (sc->zeropadBluestein[0]) {
				// Emit appendSetSMToZero and a barrier, then temporarily replace fftDim with
				// fft_zeropad_Bluestein_left_read[axis_id]; fftDim is restored after the loops.
				res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
			}
			// Iterations needed to cover fftDim/2 + 1 positions with the relevant local size.
			uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]);
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < num_in; i++) {

					// combinedID = flattened local invocation id plus this iteration's offset.
					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					// Position c of sequence combinedID / (fftDim/2 + 1). Both branches currently
					// emit the same expression.
					if (!sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "			%s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]);
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "			%s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Two optional guards, closed at the end of the iteration with the same conditions:
					// a bound on the sequence number, and a bound on combinedID.
					if (sc->axisSwapped) {
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[sc->axis_id + 1] / (double)mult));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[sc->axis_id + 1] / (double)mult));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					// Emit "<inoutID> = <index expression>;" in three pieces: the left-hand side,
					// the expression appended by indexInputVkFFT, and the terminating ";".
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Fix: store the result in res so the check below tests this call. Previously the
					// return value was dropped and the check re-tested the preceding VkAppendLine,
					// so an indexing failure went unnoticed.
					res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Paired with appendZeropadEndReadWriteStage near the end of the iteration.
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					// Factor `mult` for position c: read from twiddleLUT, or computed as
					// (cos, sin) of (pi / 2 / fftDim) * c.
					if (sc->LUT) {
						sc->tempLen = sprintf(sc->tempStr, "		mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "		mult.x = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", cosDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		mult.y = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", sinDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "		mult = sincos_20(%.17e%s * (combinedID %% %" PRIu64 ") );\n", (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					// First load (position c) into regIDs[0]. With zeropad the load is wrapped in a
					// test on inoutID % inputStride[1]; the else branch below zeroes the register.
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->mergeSequencesR2C) {
						// Merged mode: the .y component comes from the element inputStride[1] further on.
						sc->tempLen = sprintf(sc->tempStr, "		inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s%s[inoutID]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					// Shared-memory slot for position c (row/column roles swap with axisSwapped).
					if (sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// For c > 0 a second element at position fftDim - c is loaded into regIDs[1] and
					// both are combined; the generated else branch (c == 0) uses regIDs[0] alone.
					sc->tempLen = sprintf(sc->tempStr, "			if (combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "			%s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// NOTE(review): unlike the first load, this zeropad test is emitted before inoutID
					// is passed through indexInputVkFFT - confirm that ordering is intended.
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->mergeSequencesR2C) {
						sc->tempLen = sprintf(sc->tempStr, "		inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s%s[inoutID]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Store the combination for position c (sdataID still points at c here).
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Re-point sdataID at the mirrored position fftDim - c and store its combination.
					if (sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					// c == 0: a plain complex multiplication of regIDs[0] by mult.
					sc->tempLen = sprintf(sc->tempStr, "			} else {\n");
					res = VkAppendLine(sc);

					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "			}\n");

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;

					// Close the combinedID guard, then the sequence-number guard, using the same
					// conditions under which they were opened.
					if (sc->axisSwapped) {
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) {
							sc->tempLen = sprintf(sc->tempStr, "	}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1])
						{
							sc->tempLen = sprintf(sc->tempStr, "	}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->axisSwapped) {
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			// Undo the temporary fftDim override made for Bluestein zero padding.
			if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
		}
		else {
			//Not implemented
		}
		break;
	}
	case 131://DCT-III strided
	{
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftX2[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		uint64_t num_in = (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]);

		if (sc->fftDim == sc->fft_dim_full) {
			if (sc->zeropadBluestein[0]) {
				res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
			}
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < num_in; i++) {

					//sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
					//res = VkAppendLine(sc);
					//if (res != VKFFT_SUCCESS) return res;

					if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					if (sc->mergeSequencesR2C)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1], mult);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim / 2 + 1))
					{
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->mergeSequencesR2C) {
						sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x);

						sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]);
					}
					else {
						sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
						sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]);
					}
					res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->mergeSequencesR2C) {
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					if (sc->LUT) {
						sc->tempLen = sprintf(sc->tempStr, "		mult = twiddleLUT[%" PRIu64 " + combinedID];\n", sc->startDCT3LUT);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "		mult.x = %s(%.17e%s * (combinedID) );\n", cosDef, (double)(double_PI / 2 / sc->fftDim), LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		mult.y = %s(%.17e%s * (combinedID) );\n", sinDef, (double)(double_PI / 2 / sc->fftDim), LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "		mult = sincos_20(%.17e%s * (combinedID) );\n", (double)(double_PI / 2 / sc->fftDim), LFending);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					//sc->tempLen = sprintf(sc->tempStr, "		printf(\" %%f - %%f \\n\", mult.x, mult.y);\n");
					//res = VkAppendLine(sc);
					//if (res != VKFFT_SUCCESS) return res;
					if (sc->mergeSequencesR2C)
						sc->tempLen = sprintf(sc->tempStr, "		//sdataID = (combinedID) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], mult, sc->localSize[0], mult);
					else
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID) * sharedStride + %s;\n", sc->gl_LocalInvocationID_x);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "			if (combinedID  > 0){\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->mergeSequencesR2C) {
						sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x);

						sprintf(index_y, "(%" PRIu64 " - (%s/%" PRIu64 " + %" PRIu64 "))", sc->fftDim, sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]);
					}
					else {
						sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
						sprintf(index_y, "(%" PRIu64 " - (%s + %" PRIu64 "))", sc->fftDim, sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]);
					}
					res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->mergeSequencesR2C) {

					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = ((%s.x+%s.y)*mult.x-(%s.y-%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = ((%s.y-%s.x)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 " - combinedID) * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = ((%s.x+%s.y)*mult.x-(%s.y-%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = -((%s.y-%s.x)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "			} else {\n");
					res = VkAppendLine(sc);

					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = ((%s.x)*mult.x-(%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = ((%s.y)*mult.x+(%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "			}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;

					if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim / 2 + 1))
					{
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full;
		}
		else {
			//Not implemented
		}
		break;
	}
	case 140://DCT-IV nonstrided cast to 8x FFT
	{
		// Text fragments spliced into the generated kernel's index expressions.
		// Each one stays empty unless the matching performWorkGroupShift flag is set.

		// X shift: appended to the x work-group ID in the thread-disable condition.
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");

		// Y shift scaled by a work-group size (the x size when the axes are swapped,
		// the y size otherwise). Not read by the live code of this case.
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1]) {
			if (sc->axisSwapped)
				sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x);
			else
				sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
		}

		// Unscaled Y shift: appended to the y work-group ID in the
		// "combinedID / N + (workGroupID_y + shift) * localSize < size" guard emitted
		// further down in this case.
		// Fix: this string was previously written into shiftY, which left shiftY2
		// permanently empty and dropped the shift from that guard.
		char shiftY2[100] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY2, " + consts.workGroupShiftY ");
		if (sc->fftDim < sc->fft_dim_full) {
			if (sc->axisSwapped) {
				sc->tempLen = sprintf(sc->tempStr, "		%s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? %" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / sc->min_registers_per_thread / (sc->firstStageStartSize / sc->fftDim), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim));
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(sc->disableThreadsStart, "		if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full);
				sc->tempLen = sprintf(sc->tempStr, "		if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(sc->disableThreadsEnd, "}");
			}
			else {
				sprintf(sc->disableThreadsStart, "		if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full);
				res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(sc->disableThreadsEnd, "}");
			}
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "		{ \n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[1]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[1]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		if (sc->fftDim == sc->fft_dim_full) {
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->inputStride[0] > 1)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->inputStride[0], sc->fftDim / 8, sc->inputStride[1]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->fftDim / 8, sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		sdata[2*(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(2*(combinedID %% %" PRIu64 ")+1) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 + 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		sdata[2*(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(2*(combinedID %% %" PRIu64 ")+1) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 + 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->readToRegisters) {
							sc->tempLen = sprintf(sc->tempStr, "			%s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->axisSwapped) {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}

				}
			}
		}
		/*else {
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) {
					if (sc->axisSwapped) {
						if (sc->localSize[1] == 1)
							sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
						else
							sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->readToRegisters) {
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "			%s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "			%s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (sc->axisSwapped) {

							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeLeft, inputsStruct, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->readToRegisters) {
							sc->tempLen = sprintf(sc->tempStr, "			%s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->axisSwapped) {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "			sdata[sharedStride*%s + (%s + %" PRIu64 ")].x = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
								sc->tempLen = sprintf(sc->tempStr, "			sdata[sharedStride*%s + (%s + %" PRIu64 ")].y = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
		}*/
		sc->tempLen = sprintf(sc->tempStr, "	}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		break;
	}
	case 141://DCT-IV strided cast to 8x FFT
	{
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
		if (sc->fftDim != sc->fft_dim_full) {
			sprintf(sc->disableThreadsStart, "		if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]);
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;

			sprintf(sc->disableThreadsEnd, "}");
		}
		else {
			sprintf(sc->disableThreadsStart, "{\n");
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;
			sprintf(sc->disableThreadsEnd, "}");
		}
		sc->tempLen = sprintf(sc->tempStr, "		%s.x = 0;\n", sc->regIDs[1]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[1]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;

		for (uint64_t k = 0; k < sc->registerBoost; k++) {
			for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) {
				if (sc->fftDim == sc->fft_dim_full)
					sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]);
				else
					sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				sc->tempLen = sprintf(sc->tempStr, "		if(inoutID < %" PRIu64 "){\n", sc->fftDim / 8);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (sc->zeropad[0]) {
					sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x);
				res = indexInputVkFFT(sc, uintType, readType, index_x, sc->inoutID, requestCoordinate, requestBatch);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, ";\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if (sc->inputBufferBlockNum == 1)
					sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
				else
					sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		%s.y = 0;\n", sc->regIDs[0]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(2*(%s+%" PRIu64 ")+1)+%s]=%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim - 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim - 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		%s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 - 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 - 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%" PRIu64 " + 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%" PRIu64 " + 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 + 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if (sc->zeropad[0]) {
					sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->readToRegisters) {
						sc->tempLen = sprintf(sc->tempStr, "			%s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "			sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		sc->tempLen = sprintf(sc->tempStr, "	}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		break;
	}
	case 142://DCT-IV nonstrided as 2xN/2 DCT-II
	{
		// Optional text fragments spliced into the generated code: each stays empty
		// unless the matching performWorkGroupShift flag is set, in which case it
		// adds the corresponding push-constant shift term.
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		// shiftY is appended after gl_WorkGroupID_y in the batch-bound checks emitted below.
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		// ceil(fftDim / localSize[1]) when the axes are swapped, ceil(fftDim / localSize[0])
		// otherwise. The read loops below run 2 * used_registers_read iterations per k.
		uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
		// With register boost the count is split across the registerBoost outer iterations (integer division).
		if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
		if (sc->fftDim == sc->fft_dim_full) {
			// Bluestein zero-padding on read: emit the appendSetSMToZero code followed by a
			// barrier (presumably clearing shared memory first - confirm against
			// appendSetSMToZero), then shrink sc->fftDim to the Bluestein read length for
			// this axis so that all sizes emitted below are based on it.
			if (sc->zeropadBluestein[0]) {
				res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
				// NOTE(review): this recomputation does not re-apply the registerBoost division
				// done when used_registers_read was first computed - confirm this is intended.
				used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
			}
			// Upper bound on combinedID used by the "if(combinedID < ...)" guard emitted in the
			// read loops when Bluestein zero-padding is active: 2 * fftDim elements per sequence
			// times the local size along the other axis. Stays 1 (and unused) otherwise.
			uint64_t maxBluesteinCutOff = 1;
			if (sc->zeropadBluestein[0]) {
				if (sc->axisSwapped)
					maxBluesteinCutOff = 2 * sc->fftDim * sc->localSize[0];
				else
					maxBluesteinCutOff = 2 * sc->fftDim * sc->localSize[1];
			}
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < 2 * used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->inputStride[0] > 1)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1]));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1]));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are  not handling barrier with thread-conditional writes to local memory - so this is a work-around
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
#else
					if (i < used_registers_read) {
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s%s[%s]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
#endif
#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are  not handling barrier with thread-conditional writes to local memory - so this is a work-around
					if (sc->axisSwapped) {
						//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
					}
					else {
						//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2))  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2)  + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		else {\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.x;\n", sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
#endif
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->axisSwapped) {
							//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
						}
						else {
							//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2))  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2)  + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
						}
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->axisSwapped) {
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < 2 * used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->inputStride[0] > 1)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1]));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1]));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
					}
					else {
						//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2))  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2)  + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (i < used_registers_read) {
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->axisSwapped) {
							//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
						}
						else {
							//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2))  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2)  + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
						}
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->axisSwapped) {
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < 2 * used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->inputStride[0] > 1)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1]));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1]));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
					}
					else {
						//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2))  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2)  + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (i < used_registers_read) {
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->axisSwapped) {
							//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim);
						}
						else {
							//sc->tempLen = sprintf(sc->tempStr, "		sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2))  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim);
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID %% %" PRIu64 ")/2)  + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim);
						}
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->axisSwapped) {
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}

				}
			}
#endif
			// Append a barrier (appendBarrierVkFFT with argument 1) to the generated code, then call
			// appendZeropadStart; the matching appendZeropadEnd call follows the register loop below.
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			// With Bluestein zero-padding enabled, precompute the upper bound used by the generated
			// "if(combinedID < ...)" guards in the loops below: fftDim multiplied by the local size
			// along dimension 0 when the axis is swapped, along dimension 1 otherwise.
			if (sc->zeropadBluestein[0]) {
				if (sc->axisSwapped)
					maxBluesteinCutOff = sc->fftDim * sc->localSize[0];
				else
					maxBluesteinCutOff = sc->fftDim * sc->localSize[1];
			}
			// For every register slot (registerBoost x used_registers_read) emit shader code that reads
			// shared memory (sdata) and fills regIDs[i + k * registers_per_thread] with a sum/difference
			// pair. Each sprintf formats one line of generated source into sc->tempStr and VkAppendLine
			// appends it; any non-success result is returned immediately.
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {

					// combinedID = flattened local invocation index plus this slot's constant offset.
					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Optional outer guard: skip IDs at or beyond the Bluestein cut-off.
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// A range guard is emitted only when this slot can reach past fftDim * localSize[...].
					// sdataID maps combinedID to a shared-memory index: element (combinedID % fftDim) of
					// sequence (combinedID / fftDim); which of the two is scaled by sharedStride depends
					// on axisSwapped.
					if (sc->axisSwapped) {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
					}
					else {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ")  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Element index > 0: w.x = .y of the previous element (one sharedStride back when
					// axisSwapped, one entry back otherwise), w.y = .x of the current element; the
					// register receives (w.x + w.y, w.x - w.y).
					sc->tempLen = sprintf(sc->tempStr, "		if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[sdataID-sharedStride].y;\n", sc->w);
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[sdataID-1].y;\n", sc->w);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.y = sdata[sdataID].x;\n", sc->w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s.x+%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s.x-%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Element index == 0: register.x = 2 * (.x of element 0); sdataID is then re-pointed
					// at element fftDim - 1 of the same sequence and register.y = 2 * (.y of that element).
					sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.x = 2*sdata[sdataID].x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim - 1, sc->fftDim);
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 ")  + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim - 1, sc->fftDim);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.y = 2*sdata[sdataID].y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Close the generated guards in reverse order of opening; each closing brace is
					// emitted under exactly the same host-side condition as its opening brace above.
					if (sc->axisSwapped) {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					/*sc->tempLen = sprintf(sc->tempStr, "		printf(\" %%f  %%f  %%d\\n\", %s.x, %s.y, %s);\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;*/
				}
			}
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->axisSwapped) {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
					}
					else {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are  not handling barrier with thread-conditional writes to local memory - so this is a work-around
					if (sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->fftDim);
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim, sc->fftDim);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
#endif
					sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID] = %s;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					/*sc->tempLen = sprintf(sc->tempStr, "		printf(\" %%f  %%f %%d\\n\", sdata[sdataID].x, sdata[sdataID].y, %s);\n", sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;*/
				}
			}
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->fftDim);
					}
					else {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim, sc->fftDim);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					/*sc->tempLen = sprintf(sc->tempStr, "		printf(\" %%f  %%f %%d\\n\", sdata[sdataID].x, sdata[sdataID].y, %s);\n", sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;*/
				}
			}
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
#endif
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			// num_in: per-thread iteration count for the loop below, i.e. ceil((fftDim / 2 + 1) / localSize),
			// using localSize[1] when the axis is swapped and localSize[0] otherwise. fftDim / 2 is an
			// integer division.
			uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]);

			// Emit shader code that walks the first fftDim / 2 + 1 elements of each sequence in shared
			// memory and rewrites element j together with its mirror element fftDim - j, using a
			// per-element factor "mult". Runs num_in iterations for each of the registerBoost passes.
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < num_in; i++) {

					// combinedID = flattened local invocation index plus this iteration's constant offset.
					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// A range guard is emitted only when this iteration can reach past
					// (fftDim / 2 + 1) * localSize[...]; it is closed at the end of the iteration.
					if (sc->axisSwapped) {
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					// mult is either read from twiddleLUT at offset startDCT3LUT, or computed in the shader
					// from the angle (pi / 2 / fftDim) * (combinedID % (fftDim / 2 + 1)): cosDef/sinDef for
					// "float", sincos_20 for "double". No mult assignment is emitted here for any other
					// floatType value.
					if (sc->LUT) {
						sc->tempLen = sprintf(sc->tempStr, "		mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "		mult.x = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", cosDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		mult.y = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", sinDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "		mult = sincos_20(%.17e%s * (combinedID %% %" PRIu64 ") );\n", (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}

					// sdataID: shared-memory index of element (combinedID % (fftDim / 2 + 1)) in sequence
					// (combinedID / (fftDim / 2 + 1)); its value is loaded into regIDs[0].
					if (sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		%s = sdata[sdataID];\n", sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					// Element index > 0: inoutID addresses the mirror element (fftDim - index) of the same
					// sequence, loaded into regIDs[1]; both shared-memory entries are then overwritten with
					// combinations of regIDs[0], regIDs[1] and mult.
					sc->tempLen = sprintf(sc->tempStr, "			if (combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		%s = sdata[inoutID];\n", sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[inoutID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[inoutID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "			} \n");
					res = VkAppendLine(sc);

					if (res != VKFFT_SUCCESS) return res;
					// Element index == 0 has no mirror partner: sdata[sdataID] is replaced by the complex
					// product regIDs[0] * mult.
					sc->tempLen = sprintf(sc->tempStr, "			if (combinedID %% %" PRIu64 " == 0){\n", sc->fftDim / 2 + 1);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "			}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Close the range guard, under the same host-side condition that opened it.
					if (sc->axisSwapped) {
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			// With Bluestein zero-padding enabled, set sc->fftDim to sc->fft_dim_full (presumably undoing
			// a reduction made earlier in this case, outside this span) and recompute used_registers_read
			// from it, using localSize[1] when the axis is swapped and localSize[0] otherwise.
			// NOTE(review): this recomputation does not divide by registerBoost - confirm that matches
			// the initial computation of used_registers_read for this case.
			if (sc->zeropadBluestein[0]) {
				sc->fftDim = sc->fft_dim_full;
				used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
			}
		}
		else {
			//Not implemented
		}
		break;
	}
	case 143://DCT-IV strided as 2xN/2 DCT-II
	{
		// Shader-source fragments describing work-group shifts. Each stays an empty string unless the
		// corresponding performWorkGroupShift flag is set. shiftX2 (shift scaled by the work-group size
		// along x) is spliced into the global x index expressions built further down in this case.
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftX2[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		// Per-thread read count: ceil(fftDim / localSize[1]), divided (integer division) by
		// registerBoost when registerBoost > 1.
		uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
		if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
		if (sc->fftDim == sc->fft_dim_full) {
			// Bluestein zero-padding: call appendSetSMToZero (presumably emits code clearing shared
			// memory - confirm against its definition) followed by a barrier, then override sc->fftDim
			// with fft_zeropad_Bluestein_left_read[axis_id] and recompute used_registers_read from the
			// new value. Whether sc->fftDim is set back later is not visible within this span.
			// NOTE(review): the recomputation does not apply the registerBoost division used for the
			// initial value - confirm this is intended.
			if (sc->zeropadBluestein[0]) {
				res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
				used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
			}
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < 2 * used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0]));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
					sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[1]);
					res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are  not handling barrier with thread-conditional writes to local memory - so this is a work-around
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
#else
					if (i < used_registers_read) {
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s%s[%s]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
#endif
#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are  not handling barrier with thread-conditional writes to local memory - so this is a work-around
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.x;\n", sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		} else {\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.x;\n", sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
#endif
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		} else {\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		}");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around
			// Work-around body: register contents are copied into sdata by two generated passes that are
			// separated by a barrier.
			//  - Pass 1 emits stores to the .x component, guarded by (combinedID / localSize[0]) % 2 == 0.
			//  - Pass 2 emits stores to the .y component, guarded by (combinedID / localSize[0]) % 2 == 1.
			// In both passes, iterations with i < used_registers_read take the .x half of regIDs[i + ...];
			// the remaining iterations take the .y half of regIDs[i - used_registers_read + ...].

			// ---- Pass 1: even rows -> sdata[...].x ----
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < 2 * used_registers_read; i++) {

					// Flattened ID of the element handled by this thread in this unrolled iteration.
					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Guards opened here are closed in reverse order at the end of the iteration;
					// every opening condition below has an identical closing condition.
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Bounds guard is only emitted when this iteration can reach past 2 * fftDim * localSize[0].
					if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Guard on the global x index when size[0] is not a multiple of localSize[0].
					if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0]));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Emit "<inoutID> = <index expression>;" - the expression itself is produced by indexInputVkFFT.
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
					sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[1]);
					res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Zero-padding range test on index_y; its else-branch (emitted further down) stores zeros.
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					// Two consecutive rows of combinedID share one sdata row: row/2 selects the sdata row.
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (i < used_registers_read) {
						// First half of the iterations: source is the .x component of the register.
						sc->tempLen = sprintf(sc->tempStr, "		if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						// Second half of the iterations: source is the .y component of the register.
						sc->tempLen = sprintf(sc->tempStr, "		if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						// Else-branch of the zero-padding range test: store 0 into the component selected by row parity.
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		} else {\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Close the size[0] guard, the bounds guard and the Bluestein guard (reverse order of opening).
					if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		}");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			// Barrier between the .x pass and the .y pass.
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			// ---- Pass 2: odd rows -> sdata[...].y (same guard structure as pass 1) ----
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < 2 * used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0]));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
					sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[1]);
					res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (i < used_registers_read) {
						// Odd rows: .y component of sdata receives the .x half of the register.
						sc->tempLen = sprintf(sc->tempStr, "		if ((combinedID / %" PRIu64 ")%%2 == 1) {\n", sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						// Odd rows: .y component of sdata receives the .y half of the register.
						sc->tempLen = sprintf(sc->tempStr, "		if ((combinedID / %" PRIu64 ")%%2 == 1) {\n", sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						// Else-branch of the zero-padding range test; identical to the one emitted in pass 1.
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						sc->tempLen = sprintf(sc->tempStr, "		sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		} else {\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		}");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
#endif
			// After a barrier, emit code that rebuilds each register from two sdata entries:
			//   row > 0 : w.x = sdata[row-1].y, w.y = sdata[row].x, reg = (w.x + w.y, w.x - w.y)
			//   row == 0: reg.x = 2 * sdata[row 0].x, reg.y = 2 * sdata[row fftDim-1].y
			// where row = combinedID / localSize[0]. The stage ends with another barrier.
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Optional guards; each is closed by an identically-conditioned block at the end of the iteration.
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Rows > 0: combine the previous row's .y with this row's .x through the temporary sc->w.
					sc->tempLen = sprintf(sc->tempStr, "		if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.x = sdata[sdataID-sharedStride].y;\n", sc->w);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.y = sdata[sdataID].x;\n", sc->w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s.x+%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s.x-%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Row 0: doubled .x of row 0 and doubled .y of the last row (fftDim - 1).
					sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.x = 2*sdata[sdataID].x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim - 1, sc->localSize[0]);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.y = 2*sdata[sdataID].y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Close the bounds guard and the Bluestein guard (reverse order of opening).
					if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					//sc->tempLen = sprintf(sc->tempStr, "		printf(\" %%f  %%f\\n\", %s.x, %s.y);\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
					//res = VkAppendLine(sc);
					//if (res != VKFFT_SUCCESS) return res;
				}
			}
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			// Emit code that writes the registers back into sdata, then a barrier:
			//   row > 0 : sdata[row].x = reg.x and, except on backends 3 and 4, sdata[fftDim - row].y = reg.y
			//   row == 0: sdata[row] = reg (both components)
			// where row = combinedID / localSize[0]. On backends 3 and 4 the mirrored .y store is omitted here
			// by the preprocessor condition below.
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Optional guards; each is closed by an identically-conditioned block at the end of the iteration.
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are  not handling barrier with thread-conditional writes to local memory - so this is a work-around
					// Mirrored row (fftDim - row) receives the .y component in the same pass.
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
#endif
					// Row 0 stores the whole register value.
					sc->tempLen = sprintf(sc->tempStr, "		}else{\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID] = %s;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Close the bounds guard and the Bluestein guard (reverse order of opening).
					if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			// Backends 3 and 4 only: separate pass (followed by its own barrier) that emits the mirrored
			// store sdata[(fftDim - row) * sharedStride + col].y = reg.y for rows > 0, where
			// row = combinedID / localSize[0] and col = combinedID % localSize[0].
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Optional guards; each is closed by an identically-conditioned block at the end of the iteration.
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Row 0 has no mirrored store in this pass.
					sc->tempLen = sprintf(sc->tempStr, "		if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]);

					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Close the bounds guard and the Bluestein guard (reverse order of opening).
					if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
#endif
			// Emit the in-place twiddle stage over the first fftDim/2 + 1 rows of sdata.
			// For row = combinedID / localSize[0]:
			//   row > 0 : loads sdata[row] and the mirrored sdata[fftDim - row] into regIDs[0] / regIDs[1]
			//             and overwrites both entries with combinations weighted by mult.x / mult.y;
			//   row == 0: overwrites sdata[row] with the complex product regIDs[0] * mult.
			// mult comes from twiddleLUT when sc->LUT is set, otherwise it is computed in the generated code.
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			// Number of unrolled iterations needed for localSize[1] rows per iteration to cover fftDim/2 + 1 rows.
			uint64_t num_in = (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]);

			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < num_in; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					// Bounds guard, emitted only when this iteration can reach past the last processed row;
					// closed by the identically-conditioned block at the end of the iteration.
					if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->LUT) {
						// Twiddle factor read from the lookup table at offset startDCT3LUT + row.
						sc->tempLen = sprintf(sc->tempStr, "		mult = twiddleLUT[%" PRIu64 " + combinedID / %" PRIu64 "];\n", sc->startDCT3LUT, sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						// Twiddle factor computed in the shader: angle = (double_PI / 2 / fftDim) * row.
						// Nothing is emitted here if floatType is neither "float" nor "double".
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "		mult.x = %s(%.17e%s * (combinedID / %" PRIu64 ") );\n", cosDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		mult.y = %s(%.17e%s * (combinedID / %" PRIu64 ") );\n", sinDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "		mult = sincos_20(%.17e%s * (combinedID / %" PRIu64 ") );\n", (double)(double_PI / 2 / sc->fftDim), LFending, sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}

					sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "		%s = sdata[sdataID];\n", sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "			if (combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					// The generated code reuses the inoutID variable as the sdata index of the mirrored row (fftDim - row).
					sc->tempLen = sprintf(sc->tempStr, "		inoutID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s = sdata[inoutID];\n", sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					// Both values were loaded into registers above, so the four stores below do not depend on each other's order.
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[inoutID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[inoutID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "			} else {\n");
					res = VkAppendLine(sc);

					if (res != VKFFT_SUCCESS) return res;

					// Row 0: plain complex multiplication of the loaded value by mult.
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "			}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			// NOTE(review): presumably undoes a temporary reduction of sc->fftDim made earlier in this case
			// for the Bluestein zero-padded read (that code is not visible in this part of the file) - confirm.
			if (sc->zeropadBluestein[0]) {
				sc->fftDim = sc->fft_dim_full;
				// Recompute the per-thread iteration count for the restored dimension.
				used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
			}
		}
		else {
			//Not implemented
		}
		break;
	}
	case 144://odd DCT-IV nonstrided as N FFT
	{
		// Optional text fragments appended to generated index expressions when a workgroup shift is requested;
		// they stay empty otherwise.
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		// 2 when mergeSequencesR2C is set, 1 otherwise; used below to scale inputStride[1] and to divide size[1].
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		// Unrolled iterations per thread: fftDim divided (rounded up) by localSize[1] when the axes are swapped,
		// by localSize[0] otherwise.
		uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
		// With register boosting the outer k loop runs registerBoost times, so each pass handles a fraction of the iterations.
		if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
		if (sc->fftDim == sc->fft_dim_full) {
			if (sc->zeropadBluestein[0]) {
				res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
				used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
			}
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {

					if (sc->localSize[1] == 1)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->inputStride[0] > 1)
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]);
					else
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (sc->zeropadBluestein[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (sc->zeropadBluestein[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->mergeSequencesR2C) {
							sc->tempLen = sprintf(sc->tempStr, "		inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;

							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->inputBufferBlockNum == 1)
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
						else
							sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->mergeSequencesR2C) {
							sc->tempLen = sprintf(sc->tempStr, "		inoutID += %" PRIu64 ";\n", sc->inputStride[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->inputBufferBlockNum == 1)
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							else
								sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->axisSwapped) {
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}

						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->axisSwapped) {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) {
							sc->tempLen = sprintf(sc->tempStr, "		}");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {
					if (!sc->axisSwapped) {
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->zeropadBluestein[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = %" PRIu64 " + 4 * (combinedID %% %" PRIu64 ");\n", sc->fftDim / 2, sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						sc->tempLen = sprintf(sc->tempStr, "		if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = sdataID  + %s * sharedStride;\n", sc->gl_LocalInvocationID_y);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		%s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
			%s.x = -%s.x;\n\
			%s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
			%s.x = -%s.x;\n\
			%s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (sc->zeropadBluestein[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->zeropadBluestein[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "		inoutID = %" PRIu64 " + 4 * combinedID;\n", sc->fftDim / 2);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						sc->tempLen = sprintf(sc->tempStr, "		if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdataID = sdataID * sharedStride + %s;\n", sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		%s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
			%s.x = -%s.x;\n\
			%s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
			%s.x = -%s.x;\n\
			%s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (sc->zeropadBluestein[0]) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
			}
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			if (sc->zeropadBluestein[0]) {
				sc->fftDim = sc->fft_dim_full;
				used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
			}
			if (!sc->readToRegisters) {
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				res = appendZeropadStart(sc);
				if (res != VKFFT_SUCCESS) return res;
				for (uint64_t k = 0; k < sc->registerBoost; k++) {
					for (uint64_t i = 0; i < used_registers_read; i++) {
						if (sc->axisSwapped) {
							if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) {
								sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							sc->tempLen = sprintf(sc->tempStr, "			sdata[(%s+%" PRIu64 ") * sharedStride + %s].x = %s.x;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "			sdata[(%s+%" PRIu64 ") * sharedStride + %s].y = %s.y;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) {
								sc->tempLen = sprintf(sc->tempStr, "		}\n");
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						else {
							if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) {
								sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							sc->tempLen = sprintf(sc->tempStr, "			sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].x = %s.x;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "			sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].y = %s.y;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) {
								sc->tempLen = sprintf(sc->tempStr, "		}\n");
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
					}
				}
				res = appendZeropadEnd(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		else {
			//Not implemented
		}
		break;
	}
	case 145://odd DCT-IV strided as N FFT
	{
		// Emits the read stage for this case as generated kernel text, in three phases:
		//   1. load input elements into sdata (imaginary part written as 0),
		//   2. after a barrier, fetch sdata entries into sc->regIDs through the index remap below,
		//   3. unless sc->readToRegisters is set, store those registers back into sdata.
		// Code is only generated when sc->fftDim == sc->fft_dim_full; the other branch is empty.

		// Optional workgroup-shift suffixes that get spliced into the generated index expressions.
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftX2[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
		// NOTE(review): shiftY is built here but is not referenced anywhere else in this case.
		char shiftY[500] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		// mult is 2 when sc->mergeSequencesR2C is set, otherwise 1.
		uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1;
		// Iterations along local Y needed to cover fftDim elements, split across registerBoost passes.
		uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
		if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost;
		if (sc->fftDim == sc->fft_dim_full) {
			if (sc->zeropadBluestein[0]) {
				// appendSetSMToZero presumably emits code that zero-fills sdata - TODO confirm against its definition.
				res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				// Temporarily shrink sc->fftDim to the Bluestein left read boundary; it is restored
				// to fft_dim_full after the register fetch loop below.
				// Note: used_registers_read is recomputed here without the registerBoost division applied above.
				sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id];
				used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
			}
			// Phase 1: generate the input buffer -> sdata loads.
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < mult * used_registers_read; i++) {

					//sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1]);
					//res = VkAppendLine(sc);
					//if (res != VKFFT_SUCCESS) return res;

					// Guard on the global X index, emitted only when ceil(size[0]/mult) is not a multiple of localSize[0].
					// Closed by the matching condition at the end of this iteration.
					if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// combinedID = local Y id plus this iteration's offset (divided by mult in the merged case).
					if (sc->mergeSequencesR2C)
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], mult);
					else
						sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Bluestein guard: only positions below the left read boundary are loaded.
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Tail guard: emitted when this iteration can reach past the last of the fftDim elements.
					if ((1 + i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Destination slot in sdata: row (combinedID % fftDim), column local X id.
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = (combinedID %% %" PRIu64 ") * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Emit "<inoutID> = <index expression>;" - the right-hand side is appended by
					// indexInputVkFFT from the index_x / index_y strings built just below.
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->mergeSequencesR2C) {
						sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x);

						sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * used_registers_read) * sc->localSize[1]);
					}
					else {
						sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x);
						sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]);
					}
					res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStartReadWriteStage(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					// With zeropad enabled, input is read only when (index_y % fft_dim_full) lies outside
					// [fft_zeropad_left_read, fft_zeropad_right_read); the else-branch emitted below stores zeros instead.
					// NOTE(review): this "if" is opened after appendZeropadStartReadWriteStage and its else-branch is
					// emitted after appendZeropadEndReadWriteStage - confirm the generated braces nest as intended.
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Real part comes from the input buffer: direct indexing for a single block,
					// block index / in-block offset split when the input spans several blocks.
					if (sc->inputBufferBlockNum == 1)
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight);
					else
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Imaginary part is always written as 0 in this case, regardless of mergeSequencesR2C.
					sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					res = appendZeropadEndReadWriteStage(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropad[0]) {
						sc->tempLen = sprintf(sc->tempStr, "	}else{\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].x = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "		sdata[sdataID].y = 0;\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "	}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Close the guards opened above, innermost first; each condition must mirror its opener exactly.
					if ((1 + i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			// Phase 2: after a barrier, fetch permuted sdata entries into the registers.
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t k = 0; k < sc->registerBoost; k++) {
				for (uint64_t i = 0; i < used_registers_read; i++) {
					sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					// Index remap, with N = sc->fftDim: inoutID = N/2 + 4*combinedID is folded back into [0, N) by
					// the five generated "if" statements below:
					//   [0,N)   -> inoutID            [N,2N)  -> 2N-1 - inoutID
					//   [2N,3N) -> inoutID - 2N       [3N,4N) -> 4N-1 - inoutID
					//   >= 4N   -> inoutID - 4N
					sc->tempLen = sprintf(sc->tempStr, "		inoutID = %" PRIu64 " + 4 * combinedID;\n", sc->fftDim / 2);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "		if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Turn the remapped row into an sdata offset (row * sharedStride + local X id) and load it.
					sc->tempLen = sprintf(sc->tempStr, "		sdataID = sdataID * sharedStride + %s;\n", sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Values fetched for inoutID in [N,2N) and in [2N,3N) have both components negated.
					sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
			%s.x = -%s.x;\n\
			%s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\
			%s.x = -%s.x;\n\
			%s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					// Close the tail guard, then the Bluestein guard (reverse of the opening order).
					if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->zeropadBluestein[0]) {
						sc->tempLen = sprintf(sc->tempStr, "		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			// Undo the temporary Bluestein shrink of sc->fftDim done before phase 1.
			if (sc->zeropadBluestein[0]) {
				sc->fftDim = sc->fft_dim_full;
				used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
			}
			// Phase 3: when the data is not meant to stay in registers, write the registers back to sdata
			// at row (local Y id + iteration offset), column local X id.
			if (!sc->readToRegisters) {
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				res = appendZeropadStart(sc);
				if (res != VKFFT_SUCCESS) return res;
				for (uint64_t k = 0; k < sc->registerBoost; k++) {
					for (uint64_t i = 0; i < used_registers_read; i++) {
						// Tail guard: on the iteration that would run past fftDim rows, restrict to the remaining rows.
						if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) {
							sc->tempLen = sprintf(sc->tempStr, "		if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "			sdata[(%s+%" PRIu64 ") * sharedStride + %s].x = %s.x;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						sc->tempLen = sprintf(sc->tempStr, "			sdata[(%s+%" PRIu64 ") * sharedStride + %s].y = %s.y;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) {
							sc->tempLen = sprintf(sc->tempStr, "		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				res = appendZeropadEnd(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		else {
			//Not implemented
		}
		break;
	}
	}
	return res;
}

// Helper of appendReorder4StepRead: emits the statements that build the complex multiplier `mult`
// from the shader-side scalar `angle`.
//   floatType "float"  -> mult.x = cosDef(angle); mult.y = sinDef(angle);
//   floatType "double" -> mult = sincos_20(angle);
// Any other floatType emits nothing. Returns the first non-success code reported by VkAppendLine.
static inline VkFFTResult appendReorder4StepRead_multFromAngle(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* cosDef, const char* sinDef) {
	VkFFTResult res = VKFFT_SUCCESS;
	if (!strcmp(floatType, "float")) {
		sc->tempLen = sprintf(sc->tempStr, "		mult.x = %s(angle);\n", cosDef);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "		mult.y = %s(angle);\n", sinDef);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (!strcmp(floatType, "double")) {
		sc->tempLen = sprintf(sc->tempStr, "		mult = sincos_20(angle);\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	return res;
}

// Helper of appendReorder4StepRead: emits an in-place complex multiplication of register
// sc->regIDs[id] by `mult`, using `w.x` as scratch for the new real part.
static inline VkFFTResult appendReorder4StepRead_rotateRegister(VkFFTSpecializationConstantsLayout* sc, uint64_t id) {
	VkFFTResult res = VKFFT_SUCCESS;
	sc->tempLen = sprintf(sc->tempStr, "		w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "		%s.x = w.x;\n", sc->regIDs[id]);
	res = VkAppendLine(sc);
	return res;
}

// Helper of appendReorder4StepRead: emits an in-place complex multiplication of the sdata element at
// sharedStride*(rowOffset + local y id) + local x id by `mult`. The element index is first stored
// in the shader variable named by sc->inoutID; `w.x` is used as scratch for the new real part.
static inline VkFFTResult appendReorder4StepRead_rotateShared(VkFFTSpecializationConstantsLayout* sc, uint64_t rowOffset) {
	VkFFTResult res = VKFFT_SUCCESS;
	sc->tempLen = sprintf(sc->tempStr, "		%s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, rowOffset, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "		w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "		sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "		sdata[%s].x = w.x;\n", sc->inoutID);
	res = VkAppendLine(sc);
	return res;
}

// Generates the read-side twiddle multiplication of a multi-upload (four-step) FFT kernel.
//
// For every row handled by a thread (ceil(fftDim / localSize[1]) rows) the generated code loads or
// computes a complex factor `mult` and multiplies the corresponding value by it, either in
// registers (sc->readToRegisters) or in sdata (a barrier is emitted first in that case).
//
// Parameters:
//   sc          - generator state; each line is formatted into sc->tempStr and handed to VkAppendLine.
//   floatType   - "float" or "double"; selects the literal suffix and the sin/cos form.
//   uintType    - not used by this function.
//   reorderType - 1 (grouped_c2c) or 2 (single_c2c_strided); any other value emits nothing.
//
// Code is only emitted when sc->inverse is set and sc->reorderFourStep is clear; reorderType 1
// additionally requires sc->stageStartSize > 1.
//
// Returns VKFFT_SUCCESS, or the first error code returned by one of the append helpers.
static inline VkFFTResult appendReorder4StepRead(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t reorderType) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Suffix appended to the floating-point divisor literal in the generated `angle` expression.
	const char* LFending = "";
	if (!strcmp(floatType, "float")) LFending = "f";
#if(VKFFT_BACKEND==0)
	const char* cosDef = "cos";
	const char* sinDef = "sin";
	if (!strcmp(floatType, "double")) LFending = "LF";
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	const char* cosDef = "__cosf";
	const char* sinDef = "__sinf";
	if (!strcmp(floatType, "double")) LFending = "l";
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	const char* cosDef = "native_cos";
	const char* sinDef = "native_sin";
#endif

	uint64_t logicalRegistersPerThread = (sc->rader_generator[0] > 0) ? sc->min_registers_per_thread : sc->registers_per_thread_per_radix[sc->stageRadix[0]];

	// The two supported layouts share everything except the guard and the index/angle expressions.
	int grouped;
	if (reorderType == 1) {//grouped_c2c
		if (!(sc->stageStartSize > 1)) return res;
		grouped = 1;
	}
	else if (reorderType == 2) {//single_c2c_strided
		grouped = 0;
	}
	else {
		return res;
	}
	if ((sc->reorderFourStep) || (!sc->inverse)) return res;

	char shiftX[500] = "";
	if (sc->performWorkGroupShift[0])
		sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);

	if (!sc->readToRegisters) {
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
	}
	res = appendZeropadStart(sc);
	if (res != VKFFT_SUCCESS) return res;
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;

	// Loop-invariant: number of rows per thread and size of the partial last row group (0 if none).
	const uint64_t numRows = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
	const uint64_t tailRows = sc->fftDim % sc->localSize[1];
	for (uint64_t i = 0; i < numRows; i++) {
		// Only the last row group can be partial; it is wrapped in a thread-id check.
		const int guarded = (tailRows != 0) && (i == (numRows - 1));
		const uint64_t rowOffset = i * sc->localSize[1];
		if (guarded) {
			sc->tempLen = sprintf(sc->tempStr, "	if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, tailRows);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Maps the logical row index onto the register array layout.
		const uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread;
		if (sc->LUT) {
			// The LUT value is used as-is: this path is only reached with sc->inverse set,
			// so no conjugation is emitted here.
			if (grouped)
				sc->tempLen = sprintf(sc->tempStr, "		mult = twiddleLUT[%" PRIu64 "+(((%s%s)/%" PRIu64 ") %% (%" PRIu64 "))+%" PRIu64 "*(%s+%" PRIu64 ")];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, rowOffset);
			else
				sc->tempLen = sprintf(sc->tempStr, "		mult = twiddleLUT[%" PRIu64 " + ((%s%s) %% (%" PRIu64 ")) + (%s + %" PRIu64 ") * %" PRIu64 "];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, rowOffset, sc->stageStartSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (grouped)
				sc->tempLen = sprintf(sc->tempStr, "		angle = 2 * loc_PI * ((((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, rowOffset, (double)(sc->stageStartSize * sc->fftDim), LFending);
			else
				sc->tempLen = sprintf(sc->tempStr, "		angle = 2 * loc_PI * ((((%s%s) %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, rowOffset, (double)(sc->stageStartSize * sc->fftDim), LFending);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendReorder4StepRead_multFromAngle(sc, floatType, cosDef, sinDef);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (sc->readToRegisters)
			res = appendReorder4StepRead_rotateRegister(sc, id);
		else
			res = appendReorder4StepRead_rotateShared(sc, rowOffset);
		if (res != VKFFT_SUCCESS) return res;
		if (guarded) {
			sc->tempLen = sprintf(sc->tempStr, "	}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
	}
	res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
	if (res != VKFFT_SUCCESS) return res;
	res = appendZeropadEnd(sc);
	return res;
}
// Helper of appendReorder4StepWrite: emits the statements that build the complex multiplier `mult`
// from the shader-side scalar `angle`. When sc->inverse is clear the imaginary part is negated
// (float: mult.y = -sinDef(angle); double: sincos_20(-angle)).
// A floatType other than "float"/"double" emits nothing.
static inline VkFFTResult appendReorder4StepWrite_multFromAngle(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* cosDef, const char* sinDef) {
	VkFFTResult res = VKFFT_SUCCESS;
	if (!strcmp(floatType, "float")) {
		sc->tempLen = sprintf(sc->tempStr, "		mult.x = %s(angle);\n", cosDef);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		if (sc->inverse)
			sc->tempLen = sprintf(sc->tempStr, "		mult.y = %s(angle);\n", sinDef);
		else
			sc->tempLen = sprintf(sc->tempStr, "		mult.y = -%s(angle);\n", sinDef);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (!strcmp(floatType, "double")) {
		if (sc->inverse)
			sc->tempLen = sprintf(sc->tempStr, "		mult = sincos_20(angle);\n");
		else
			sc->tempLen = sprintf(sc->tempStr, "		mult = sincos_20(-angle);\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	return res;
}

// Helper of appendReorder4StepWrite: emits an in-place complex multiplication of register
// sc->regIDs[id] by `mult`, using `w.x` as scratch for the new real part.
static inline VkFFTResult appendReorder4StepWrite_rotateRegister(VkFFTSpecializationConstantsLayout* sc, uint64_t id) {
	VkFFTResult res = VKFFT_SUCCESS;
	sc->tempLen = sprintf(sc->tempStr, "		w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "		%s.x = w.x;\n", sc->regIDs[id]);
	res = VkAppendLine(sc);
	return res;
}

// Helper of appendReorder4StepWrite: emits an in-place complex multiplication of the sdata element at
// sharedStride*(rowOffset + local y id) + local x id by `mult`. The element index is first stored
// in the shader variable named by sc->inoutID; `w.x` is used as scratch for the new real part.
static inline VkFFTResult appendReorder4StepWrite_rotateShared(VkFFTSpecializationConstantsLayout* sc, uint64_t rowOffset) {
	VkFFTResult res = VKFFT_SUCCESS;
	sc->tempLen = sprintf(sc->tempStr, "		%s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, rowOffset, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "		w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "		sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "		sdata[%s].x = w.x;\n", sc->inoutID);
	res = VkAppendLine(sc);
	return res;
}

// Generates the write-side twiddle multiplication of a multi-upload (four-step) FFT kernel.
//
// For every row handled by a thread (ceil(fftDim / localSize[1]) rows) the generated code loads or
// computes a complex factor `mult` and multiplies the corresponding value by it, either in
// registers (sc->writeFromRegisters) or in sdata (a barrier is emitted first in that case).
// When sc->inverse is clear the factor is conjugated (LUT: mult.y = -mult.y; computed: negated sine).
//
// Parameters:
//   sc          - generator state; each line is formatted into sc->tempStr and handed to VkAppendLine.
//   floatType   - "float" or "double"; selects the literal suffix and the sin/cos form.
//   uintType    - not used by this function.
//   reorderType - 1 (grouped_c2c) or 2 (single_c2c_strided); any other value emits nothing.
//
// Code is emitted exactly when appendReorder4StepRead's condition (!reorderFourStep && inverse)
// does not hold, i.e. when sc->reorderFourStep is set or sc->inverse is clear; reorderType 1
// additionally requires sc->stageStartSize > 1.
//
// Returns VKFFT_SUCCESS, or the first error code returned by one of the append helpers.
static inline VkFFTResult appendReorder4StepWrite(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t reorderType) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Suffix appended to the floating-point divisor literal in the generated `angle` expression.
	const char* LFending = "";
	if (!strcmp(floatType, "float")) LFending = "f";
#if(VKFFT_BACKEND==0)
	const char* cosDef = "cos";
	const char* sinDef = "sin";
	if (!strcmp(floatType, "double")) LFending = "LF";
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	const char* cosDef = "__cosf";
	const char* sinDef = "__sinf";
	if (!strcmp(floatType, "double")) LFending = "l";
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	const char* cosDef = "native_cos";
	const char* sinDef = "native_sin";
#endif

	uint64_t logicalRegistersPerThread = (sc->rader_generator[sc->numStages - 1] > 0) ? sc->min_registers_per_thread : sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]];

	// The two supported layouts share everything except the guard and the index/angle expressions.
	int grouped;
	if (reorderType == 1) {//grouped_c2c
		if (!(sc->stageStartSize > 1)) return res;
		grouped = 1;
	}
	else if (reorderType == 2) {//single_c2c_strided
		grouped = 0;
	}
	else {
		return res;
	}
	// The (!reorderFourStep && inverse) case is the one handled on the read side.
	if ((!sc->reorderFourStep) && (sc->inverse)) return res;

	char shiftX[500] = "";
	if (sc->performWorkGroupShift[0])
		sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);

	if (!sc->writeFromRegisters) {
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
	}
	res = appendZeropadStart(sc);
	if (res != VKFFT_SUCCESS) return res;
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;

	// Loop-invariant: number of rows per thread and size of the partial last row group (0 if none).
	const uint64_t numRows = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
	const uint64_t tailRows = sc->fftDim % sc->localSize[1];
	for (uint64_t i = 0; i < numRows; i++) {
		// Only the last row group can be partial; it is wrapped in a thread-id check.
		const int guarded = (tailRows != 0) && (i == (numRows - 1));
		const uint64_t rowOffset = i * sc->localSize[1];
		if (guarded) {
			sc->tempLen = sprintf(sc->tempStr, "	if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, tailRows);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Maps the logical row index onto the register array layout.
		const uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread;
		if (sc->LUT) {
			if (grouped)
				sc->tempLen = sprintf(sc->tempStr, "		mult = twiddleLUT[%" PRIu64 "+(((%s%s)/%" PRIu64 ") %% (%" PRIu64 "))+%" PRIu64 "*(%s+%" PRIu64 ")];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, rowOffset);
			else
				sc->tempLen = sprintf(sc->tempStr, "		mult = twiddleLUT[%" PRIu64 " + ((%s%s) %% (%" PRIu64 ")) + (%s + %" PRIu64 ") * %" PRIu64 "];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, rowOffset, sc->stageStartSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			if (!sc->inverse) {
				// Forward direction uses the conjugate of the tabulated value.
				sc->tempLen = sprintf(sc->tempStr, "	mult.y = -mult.y;\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		else {
			if (grouped)
				sc->tempLen = sprintf(sc->tempStr, "		angle = 2 * loc_PI * ((((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, rowOffset, (double)(sc->stageStartSize * sc->fftDim), LFending);
			else
				sc->tempLen = sprintf(sc->tempStr, "		angle = 2 * loc_PI * ((((%s%s) %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, rowOffset, (double)(sc->stageStartSize * sc->fftDim), LFending);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendReorder4StepWrite_multFromAngle(sc, floatType, cosDef, sinDef);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (sc->writeFromRegisters)
			res = appendReorder4StepWrite_rotateRegister(sc, id);
		else
			res = appendReorder4StepWrite_rotateShared(sc, rowOffset);
		if (res != VKFFT_SUCCESS) return res;
		if (guarded) {
			sc->tempLen = sprintf(sc->tempStr, "	}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
	}
	res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
	if (res != VKFFT_SUCCESS) return res;
	res = appendZeropadEnd(sc);
	return res;
}

// Helper of appendBluesteinMultiplication: classifies strideType by the work-group dimension
// that the per-thread element loop and its bounds check run along.
// Returns 0 for the localSize[0] / local-x group, 1 for the localSize[1] / local-y group,
// and -1 for any other value.
static inline int appendBluesteinMultiplication_strideAxis(uint64_t strideType) {
	switch (strideType) {
	case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
		return 0;
	case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145:
		return 1;
	default:
		return -1;
	}
}

// Helper of appendBluesteinMultiplication: non-zero when the multiplication is applied directly to
// registers (pre-multiplication with readToRegisters, or post-multiplication with
// writeFromRegisters) rather than to values round-tripped through sdata.
static inline int appendBluesteinMultiplication_inRegisters(VkFFTSpecializationConstantsLayout* sc, uint64_t pre_or_post_multiplication) {
	return (sc->readToRegisters && (pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1));
}

// Generates code that multiplies every element owned by a thread by the matching entry of the
// `BluesteinMultiplication` buffer: w = BluesteinMultiplication[inoutID]; reg = reg * w
// (VkMulComplex when sc->inverseBluestein is set, VkMulComplexConj otherwise).
//
// Parameters:
//   sc                         - generator state; lines are formatted into sc->tempStr and appended.
//   floatType, uintType        - not used by this function.
//   strideType                 - selects the indexing scheme (see the strideAxis helper above);
//                                value 2 shares the bounds check of the local-y group but the
//                                buffer index expression of the local-x group.
//   pre_or_post_multiplication - 0 for the multiplication on the read side, 1 on the write side;
//                                selects which register flag and which zeropadBluestein entry apply.
//
// When the values are not already in registers a barrier is emitted first, and each element is
// loaded from sdata into sc->regIDs[i], multiplied, and stored back.
//
// Returns VKFFT_SUCCESS, or the first error code returned by one of the append helpers.
static inline VkFFTResult appendBluesteinMultiplication(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t strideType, uint64_t pre_or_post_multiplication) {
	VkFFTResult res = VKFFT_SUCCESS;
	const char* kernelName = "BluesteinMultiplication";
	const int axis = appendBluesteinMultiplication_strideAxis(strideType);
	char shiftX[500] = "";
	char index_x[2000] = "";
	if (sc->performWorkGroupShift[0])
		sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);

	if (!appendBluesteinMultiplication_inRegisters(sc, pre_or_post_multiplication)) {
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
	}
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;

	// Number of elements each thread processes; stays 1 for an unrecognised strideType.
	uint64_t used_registers = 1;
	if (axis == 0)
		used_registers = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
	else if (axis == 1)
		used_registers = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);

	for (uint64_t i = 0; i < used_registers; i++) {
		// Open a thread-id check when this pass would run past fftDim.
		int guarded = 0;
		if ((axis == 0) && (sc->localSize[0] * ((1 + i)) > sc->fftDim)) {
			uint64_t current_group_cut = sc->fftDim - i * sc->localSize[0];
			guarded = 1;
			sc->tempLen = sprintf(sc->tempStr, "		if (%s  < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else if ((axis == 1) && (sc->localSize[1] * ((1 + i)) > sc->fftDim)) {
			uint64_t current_group_cut = sc->fftDim - i * sc->localSize[1];
			guarded = 1;
			sc->tempLen = sprintf(sc->tempStr, "		if (%s  < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		// Compute the index into the multiplication buffer and store it in inoutID.
		if ((axis == 0) || (strideType == 2)) {
			if (sc->fftDim == sc->fft_dim_full) {
				sc->tempLen = sprintf(sc->tempStr, "		%s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sprintf(index_x, " (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim);
				sc->tempLen = sprintf(sc->tempStr, "		%s = %s;\n", sc->inoutID, index_x);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		else if (axis == 1) {
			if (sc->fftDim == sc->fft_dim_full) {
				sc->tempLen = sprintf(sc->tempStr, "		%s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_y, i * sc->localSize[1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "		%s = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->inoutID, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		// With Bluestein zero-padding, only indices below the left padding bound are multiplied.
		if ((sc->zeropadBluestein[0]) && (pre_or_post_multiplication == 0)) {
			sc->tempLen = sprintf(sc->tempStr, "		if((%s %% %" PRIu64 ") < %" PRIu64 "){\n", sc->inoutID, sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		if ((sc->zeropadBluestein[1]) && (pre_or_post_multiplication == 1)) {
			sc->tempLen = sprintf(sc->tempStr, "		if((%s %% %" PRIu64 ") < %" PRIu64 "){\n", sc->inoutID, sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		sc->tempLen = sprintf(sc->tempStr, "		w = %s[%s];\n", kernelName, sc->inoutID);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;

		// Values living in sdata are pulled into a register first.
		if (!appendBluesteinMultiplication_inRegisters(sc, pre_or_post_multiplication)) {
			if (axis == 0)
				sc->tempLen = sprintf(sc->tempStr, "		%s = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[i], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x);
			else
				sc->tempLen = sprintf(sc->tempStr, "		%s = sdata[%s + (%s + %" PRIu64 " * %s)*sharedStride];\n", sc->regIDs[i], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		if (sc->inverseBluestein)
			res = VkMulComplex(sc, sc->regIDs[i], sc->regIDs[i], "w", sc->temp);
		else
			res = VkMulComplexConj(sc, sc->regIDs[i], sc->regIDs[i], "w", sc->temp);
		if (res != VKFFT_SUCCESS) return res;

		// ...and written back to the same sdata slot afterwards.
		if (!appendBluesteinMultiplication_inRegisters(sc, pre_or_post_multiplication)) {
			if (axis == 0)
				sc->tempLen = sprintf(sc->tempStr, "		sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x, sc->regIDs[i]);
			else
				sc->tempLen = sprintf(sc->tempStr, "		sdata[%s + (%s + %" PRIu64 " * %s)*sharedStride] = %s;\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->regIDs[i]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		// Close the zero-padding check(s) opened above.
		if ((sc->zeropadBluestein[0]) && (pre_or_post_multiplication == 0)) {
			sc->tempLen = sprintf(sc->tempStr, "		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		if ((sc->zeropadBluestein[1]) && (pre_or_post_multiplication == 1)) {
			sc->tempLen = sprintf(sc->tempStr, "		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		// Close the thread-id check opened at the top of this pass.
		if (guarded) {
			sc->tempLen = sprintf(sc->tempStr, "		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
	}
	res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
	return res;
}

static inline VkFFTResult appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageID, uint64_t strided) {
	VkFFTResult res = VKFFT_SUCCESS;
	long double double_PI = 3.14159265358979323846264338327950288419716939937510L;
	char vecType[30];
	char LFending[4] = "";
	char tempNum[100] = "";
	if (!strcmp(floatType, "float")) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
	char cosDef[20] = "cos";
	char sinDef[20] = "sin";
	if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
#elif(VKFFT_BACKEND==1)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	char cosDef[20] = "__cosf";
	char sinDef[20] = "__sinf";
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#elif(VKFFT_BACKEND==2)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	char cosDef[20] = "__cosf";
	char sinDef[20] = "__sinf";
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	char cosDef[20] = "native_cos";
	char sinDef[20] = "native_sin";
	//if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#endif
	char stageNormalization[50] = "";
	uint64_t normalizationValue = 1;
	if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) {
		if ((sc->performDCT) && (sc->actualInverse)) {
			if (sc->performDCT == 1)
				normalizationValue = (sc->sourceFFTSize - 1) * 2;
			else
				normalizationValue = sc->sourceFFTSize * 2;
		}
		else
			normalizationValue = sc->sourceFFTSize;
	}
	if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) {
		normalizationValue *= sc->fft_dim_full;
	}
	if (normalizationValue != 1) {
		sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending);
	}
	char convolutionInverse[10] = "";
	sc->useCoalescedLUTUploadToSM = 0;
	if (sc->convolutionStep) {
		if (stageAngle < 0)
			sprintf(convolutionInverse, ", 0");
		else
			sprintf(convolutionInverse, ", 1");
	}
	res = appendBarrierVkFFT(sc, 1);
	if (res != VKFFT_SUCCESS) return res;


	res = appendZeropadStart(sc);
	if (res != VKFFT_SUCCESS) return res;
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;
	//rotate the stage
	char* gl_LocalInvocationID = (strided) ? sc->gl_LocalInvocationID_y : sc->gl_LocalInvocationID_x;
	if (stageSize > 1) {
		uint64_t num_logical_subgroups = (strided) ? sc->localSize[1] : sc->localSize[0];
		uint64_t num_logical_groups = (uint64_t)ceil((sc->fftDim) / (double)(num_logical_subgroups));
		for (uint64_t t = 0; t < num_logical_groups; t++) {
			if (((1 + t) * num_logical_subgroups) > sc->fftDim) {
				uint64_t current_group_cut = sc->fftDim - t * num_logical_subgroups;
				sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", gl_LocalInvocationID, current_group_cut);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+%" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, gl_LocalInvocationID, t * num_logical_subgroups, stageSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			if (sc->LUT)
				sc->tempLen = sprintf(sc->tempStr, "		LUTId = stageInvocationID*%" PRIu64 " + %" PRIu64 ";\n", stageRadix, stageSizeSum);
			else
				sc->tempLen = sprintf(sc->tempStr, "		angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			if (sc->LUT) {
				sc->tempLen = sprintf(sc->tempStr, "		%s = twiddleLUT[LUTId+(%s+ %" PRIu64 ") / %" PRIu64 "];\n\n", sc->w, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (!sc->inverse) {
					sc->tempLen = sprintf(sc->tempStr, "		%s.y = -%s.y;\n", sc->w, sc->w);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			else {
				if (!strcmp(floatType, "float")) {
					sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s(angle*%.17e%s*((%s+ %" PRIu64 ") / %" PRIu64 "));\n", sc->w, cosDef, 2.0 / stageRadix, LFending, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s(angle*%.17e%s*((%s+ %" PRIu64 ") / %" PRIu64 "));\n", sc->w, sinDef, 2.0 / stageRadix, LFending, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
				}
				if (!strcmp(floatType, "double")) {
					sc->tempLen = sprintf(sc->tempStr, "		%s = sincos_20(angle*%.17e%s*((%s+ %" PRIu64 ") / %" PRIu64 "));\n", sc->w, 2.0 / stageRadix, LFending, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			//sc->tempLen = sprintf(sc->tempStr, "	printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->w, sc->w);
			//res = VkAppendLine(sc);
			//if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+ %" PRIu64 ");\n", sc->sdataID, gl_LocalInvocationID, t * num_logical_subgroups);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			if (sc->resolveBankConflictFirstStages == 1) {
				sc->tempLen = sprintf(sc->tempStr, "\
	%s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (strided) {
				sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				if (sc->localSize[1] > 1) {
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->regIDs[0], sc->sdataID);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			res = VkMulComplex(sc, sc->temp, sc->regIDs[0], sc->w, 0);
			if (res != VKFFT_SUCCESS) return res;

			sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s] = %s;\n", sc->sdataID, sc->temp);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			if (((1 + t) * num_logical_subgroups) > sc->fftDim) {
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadEnd(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadStart(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
		if (res != VKFFT_SUCCESS) return res;
	}
	uint64_t raderTranspose = ((sc->currentRaderContainer->containerFFTNum < 8) || (sc->currentRaderContainer->numStages == 1) || (strided)) ? 0 : 1;

	// read x0 - to be used in the end
	{
		uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[0];
		uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
		//uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
		uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs
		//uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim;
		//uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread);
		uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix 

		if (!raderTranspose) {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (!raderTranspose) {
			sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		sc->tempLen = sprintf(sc->tempStr, "\
	%s = %s;\n", sc->sdataID, sc->raderIDx2);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		if (strided) {
			sc->tempLen = sprintf(sc->tempStr, "\
	%s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->localSize[1] > 1) {
				sc->tempLen = sprintf(sc->tempStr, "\
	%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		sc->tempLen = sprintf(sc->tempStr, "\
	%s = sdata[%s];\n", sc->x0[0], sc->sdataID);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;

		sc->tempLen = sprintf(sc->tempStr, "\
	}\n");
		res = VkAppendLine(sc);
	}
	// read x0 for x0+x1 - 0-element
	{
		uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[sc->currentRaderContainer->numStages - 1];
		uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
		//uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
		uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs
		//uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim;
		//uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread);
		uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix 
		if (!raderTranspose) {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (!raderTranspose) {
			sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		sc->tempLen = sprintf(sc->tempStr, "\
		if (%s == 0) {\n", sc->raderIDx);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;

		sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s;\n", sc->sdataID, sc->raderIDx2);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		if (strided) {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->localSize[1] > 1) {
				sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->x0[1], sc->sdataID);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (sc->currentRaderContainer->numStages == 1) {
		if (res != VKFFT_SUCCESS) return res;
		res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadEnd(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadStart(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
		if (res != VKFFT_SUCCESS) return res;
	}
	uint64_t locStageSize = 1;
	uint64_t locStageSizeSum = 0;
	long double locStageAngle = -double_PI;
	uint64_t shift = 0;
	for (uint64_t rader_stage = 0; rader_stage < sc->currentRaderContainer->numStages; rader_stage++) {
		uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[rader_stage];
		uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
		uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
		uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs
		uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim;
		//uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread);
		uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix 
		uint64_t locFFTDimStride = locFFTDim;
		if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift;
		//local radix
		if ((rader_stage == 0) || (!raderTranspose)) {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t k = 0; k < sc->registerBoost; k++) {
			if ((rader_stage == 0) || (!raderTranspose)) {
				sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) {
				if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) continue;
				if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) {
					uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize;

					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}

				sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx, (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize, locStageSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (sc->LUT)
					sc->tempLen = sprintf(sc->tempStr, "		LUTId = stageInvocationID + %" PRIu64 ";\n", locStageSizeSum + sc->currentRaderContainer->RaderRadixOffsetLUT);
				else
					sc->tempLen = sprintf(sc->tempStr, "		angle = stageInvocationID * %.17e%s;\n", (double)(locStageAngle), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				for (uint64_t i = 0; i < locStageRadix; i++) {
					uint64_t g = sc->currentRaderContainer->generator;
					if (rader_stage == 0) {
						if (sc->inline_rader_g_pow == 1) {
							sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow_%" PRIu64 "[%s + %" PRIu64 "];\n", sc->sdataID, stageRadix, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else if (sc->inline_rader_g_pow == 2) {
							sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow[%s + %" PRIu64 "];\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix + sc->currentRaderContainer->raderUintLUToffset);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "\
			%s= (%s + %" PRIu64 ");\n\
			%s=1;\n\
			while (%s != 0)\n\
			{\n\
				%s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\
				%s--;\n\
			}\n", sc->inoutID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %s*%" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, sc->sdataID, sc->currentRaderContainer->containerFFTNum);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (!raderTranspose) {
							sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 " + %s*%" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix + sc->fftDim / stageRadix, sc->raderIDx2, locFFTDimStride);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s + %" PRIu64 ")*%" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix, sc->currentRaderContainer->containerFFTNum, sc->raderIDx2, sc->fftDim / stageRadix);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}

					uint64_t id = j + i * logicalRegistersPerThread / locStageRadix;
					id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
					if (!strided) {
						if (sc->resolveBankConflictFirstStages == 1) {
							sc->tempLen = sprintf(sc->tempStr, "\
	%s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					if (strided) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (sc->localSize[1] > 1) {
							sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->regIDs[id], sc->sdataID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}

				char** regID = (char**)malloc(sizeof(char*) * locStageRadix);
				if (regID) {
					for (uint64_t i = 0; i < locStageRadix; i++) {
						regID[i] = (char*)malloc(sizeof(char) * 50);
						if (!regID[i]) {
							for (uint64_t p = 0; p < i; p++) {
								free(regID[p]);
								regID[p] = 0;
							}
							free(regID);
							regID = 0;
							return VKFFT_ERROR_MALLOC_FAILED;
						}
						uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix;
						id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
						sprintf(regID[i], "%s", sc->regIDs[id]);
					}
					res = inlineRadixKernelVkFFT(sc, floatType, uintType, locStageRadix, locStageSize, locStageSizeSum, locStageAngle, regID);
					if (res != VKFFT_SUCCESS) return res;
					for (uint64_t i = 0; i < locStageRadix; i++) {
						uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix;
						id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
						sprintf(sc->regIDs[id], "%s", regID[i]);
					}
					for (uint64_t i = 0; i < locStageRadix; i++) {
						free(regID[i]);
						regID[i] = 0;
					}
					free(regID);
					regID = 0;
				}
				else
					return VKFFT_ERROR_MALLOC_FAILED;

				if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) {
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (rader_stage != sc->currentRaderContainer->numStages - 1) {
			res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;
		}
		//local shuffle
		char** tempID;
		tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
		if (tempID) {
			for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
				tempID[i] = (char*)malloc(sizeof(char) * 50);
				if (!tempID[i]) {
					for (uint64_t j = 0; j < i; j++) {
						free(tempID[j]);
						tempID[j] = 0;
					}
					free(tempID);
					tempID = 0;
					return VKFFT_ERROR_MALLOC_FAILED;
				}
			}
			for (uint64_t k = 0; k < sc->registerBoost; ++k) {
				uint64_t t = 0;

				if ((rader_stage == 0) || (!raderTranspose)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				//last stage - save x1
				if (rader_stage == sc->currentRaderContainer->numStages - 1) {

					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s == 0) {\n", sc->raderIDx);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = VkAddComplex(sc, sc->x0[1], sc->x0[1], sc->regIDs[0]);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strided) {
					if (rader_stage != 0) {
						shift = (subLogicalGroupSize > (locFFTDim % (sc->numSharedBanks / 2))) ? subLogicalGroupSize - locFFTDim % (sc->numSharedBanks / 2) : 0;
						if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift;
					}
					else {
						if (sc->sharedShiftRaderFFT > 0) {
							sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
							if (res != VKFFT_SUCCESS) return res;
							res = appendZeropadEnd(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "\
		sharedStride = %" PRIu64 ";\n", sc->sharedStrideRaderFFT);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							res = appendZeropadStart(sc);
							if (res != VKFFT_SUCCESS) return res;
							res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
							if (res != VKFFT_SUCCESS) return res;
							if ((rader_stage == 0) || (!raderTranspose)) {
								sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						shift = ((locFFTDim % (sc->numSharedBanks / 2))) ? 0 : 1;
						if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift;
					}
				}
				for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) {
					if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) {
						if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) {
							uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize;
							sc->tempLen = sprintf(sc->tempStr, "\
		if (%s  < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sprintf(tempNum, "%" PRIu64 "", j * subLogicalGroupSize);
						res = VkAddReal(sc, sc->stageInvocationID, sc->raderIDx, tempNum);
						if (res != VKFFT_SUCCESS) return res;
						res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID);
						if (res != VKFFT_SUCCESS) return res;
						sprintf(tempNum, "%" PRIu64 "", locStageSize);
						res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum);
						if (res != VKFFT_SUCCESS) return res;
						res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID);
						if (res != VKFFT_SUCCESS) return res;
						sprintf(tempNum, "%" PRIu64 "", locStageRadix);
						res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum);
						if (res != VKFFT_SUCCESS) return res;
						res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID);
						if (res != VKFFT_SUCCESS) return res;
					}
					/*sc->tempLen = sprintf(sc->tempStr, "\
	stageInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") %% (%" PRIu64 ");\n\
	blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\
	inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/

					for (uint64_t i = 0; i < locStageRadix; i++) {
						uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix;
						id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
						sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
						t++;
						if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) {
							sprintf(tempNum, "%" PRIu64 "", i * locStageSize);
							res = VkAddReal(sc, sc->combinedID, sc->inoutID, tempNum);
							if (res != VKFFT_SUCCESS) return res;

							//last stage - mult rader kernel
							if (rader_stage == sc->currentRaderContainer->numStages - 1) {
								if (sc->inline_rader_kernel) {
									sc->tempLen = sprintf(sc->tempStr, "\
		%s.x = r_rader_kernel_%" PRIu64 "[%s];\n\
		%s.y = i_rader_kernel_%" PRIu64 "[%s];\n", sc->w, stageRadix, sc->combinedID, sc->w, stageRadix, sc->combinedID);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
								}
								else {
									sc->tempLen = sprintf(sc->tempStr, "\
		%s = twiddleLUT[%s+%" PRIu64 "];\n", sc->w, sc->combinedID, sc->currentRaderContainer->RaderKernelOffsetLUT);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
								}
								/*sc->tempLen = sprintf(sc->tempStr, "\
		printf(\"%%f %%f - %%f %%f\\n\", %s.x, %s.y, %s.x, %s.y);\n", sc->regIDs[id], sc->regIDs[id], sc->w, sc->w);
					res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;*/
								res = VkMulComplex(sc, sc->regIDs[id], sc->regIDs[id], sc->w, sc->temp);
								if (res != VKFFT_SUCCESS) return res;
							}
							if (rader_stage != sc->currentRaderContainer->numStages - 1) {
								if (!raderTranspose) {
									sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix);
									res = VkAddReal(sc, sc->sdataID, sc->combinedID, tempNum);
									if (res != VKFFT_SUCCESS) return res;
									sprintf(tempNum, "%s * %" PRIu64 "", sc->raderIDx2, locFFTDimStride);
									res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum);
									if (res != VKFFT_SUCCESS) return res;
								}
								else {
									sprintf(tempNum, "%" PRIu64 "", sc->currentRaderContainer->containerFFTNum);
									res = VkMulReal(sc, sc->sdataID, sc->combinedID, tempNum);
									if (res != VKFFT_SUCCESS) return res;
									sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix);
									res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum);
									if (res != VKFFT_SUCCESS) return res;
									sprintf(tempNum, "%s", sc->raderIDx2);
									res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum);
									if (res != VKFFT_SUCCESS) return res;
								}
								if (!strided) {
									if (0 && (locStageSize <= sc->numSharedBanks / 2) && (locFFTsCombined > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != locFFTDim / sc->registerBoost) && ((locFFTDim & (locFFTDim - 1)) == 0) && (locStageSize * locStageRadix != locFFTDim)) {
										if (sc->resolveBankConflictFirstStages == 0) {
											sc->resolveBankConflictFirstStages = 1;
											sc->tempLen = sprintf(sc->tempStr, "\
	%s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideBankConflictFirstStages);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
										}
										sc->tempLen = sprintf(sc->tempStr, "\
	%s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;

									}
									else {
										if (sc->resolveBankConflictFirstStages == 1) {
											sc->resolveBankConflictFirstStages = 0;
											sc->tempLen = sprintf(sc->tempStr, "\
	%s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideReadWriteConflict);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) return res;
										}
									}
								}
								if (strided) {
									res = VkMulReal(sc, sc->sdataID, sc->sdataID, sc->sharedStride);
									if (res != VKFFT_SUCCESS) return res;
									res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
									if (res != VKFFT_SUCCESS) return res;
								}
								else {
									if (sc->localSize[1] > 1) {
										res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride);
										if (res != VKFFT_SUCCESS) return res;
										res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID);
										if (res != VKFFT_SUCCESS) return res;
									}
								}
								//sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize);
								res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						/*sc->tempLen = sprintf(sc->tempStr, "\
sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/
					}
					if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) {
						if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) {
							sc->tempLen = sprintf(sc->tempStr, "	}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) {
					sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]);
					t++;
				}
				t = 0;
			}
			if (rader_stage != sc->currentRaderContainer->numStages - 1) {
				for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
					//printf("0 - %s\n", resID[i]);
					sprintf(sc->regIDs[i], "%s", tempID[i]);
					//sprintf(resID[i], "%s", tempID[i]);
					//printf("1 - %s\n", resID[i]);
				}
			}
			for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
				free(tempID[i]);
				tempID[i] = 0;
			}
			free(tempID);
			tempID = 0;
		}
		else
			return VKFFT_ERROR_MALLOC_FAILED;

		if (rader_stage > 0) {
			switch (locStageRadix) {
			case 2:
				locStageSizeSum += locStageSize;
				break;
			case 3:
				locStageSizeSum += locStageSize * 2;
				break;
			case 4:
				locStageSizeSum += locStageSize * 2;
				break;
			case 5:
				locStageSizeSum += locStageSize * 4;
				break;
			case 6:
				locStageSizeSum += locStageSize * 5;
				break;
			case 7:
				locStageSizeSum += locStageSize * 6;
				break;
			case 8:
				locStageSizeSum += locStageSize * 3;
				break;
			case 9:
				locStageSizeSum += locStageSize * 8;
				break;
			case 10:
				locStageSizeSum += locStageSize * 9;
				break;
			case 11:
				locStageSizeSum += locStageSize * 10;
				break;
			case 12:
				locStageSizeSum += locStageSize * 11;
				break;
			case 13:
				locStageSizeSum += locStageSize * 12;
				break;
			case 14:
				locStageSizeSum += locStageSize * 13;
				break;
			case 15:
				locStageSizeSum += locStageSize * 14;
				break;
			case 16:
				locStageSizeSum += locStageSize * 4;
				break;
			case 32:
				locStageSizeSum += locStageSize * 5;
				break;
			default:
				locStageSizeSum += locStageSize * (locStageRadix);
				break;
			}
		}
		locStageSize *= locStageRadix;
		locStageAngle /= locStageRadix;

		if (rader_stage != sc->currentRaderContainer->numStages - 1) {
			res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;
		}
	}

	//iFFT
	locStageSize = 1;
	locStageAngle = double_PI;
	locStageSizeSum = 0;
	for (int64_t rader_stage = sc->currentRaderContainer->numStages - 1; rader_stage >= 0; rader_stage--) {
		uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[rader_stage];
		uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
		uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
		uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs
		uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim;
		//uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread);
		uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix 
		uint64_t locFFTDimStride = locFFTDim; //different length due to all -1 cutoffs
		if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift;
		//local radix
		if (!raderTranspose) {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t k = 0; k < sc->registerBoost; k++) {
			if (!raderTranspose) {
				sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) {
				if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) continue;
				if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) {
					uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize;
					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s  < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}

				sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx, (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize, locStageSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (sc->LUT)
					sc->tempLen = sprintf(sc->tempStr, "		LUTId = stageInvocationID + %" PRIu64 ";\n", locStageSizeSum + sc->currentRaderContainer->RaderRadixOffsetLUTiFFT);
				else
					sc->tempLen = sprintf(sc->tempStr, "		angle = stageInvocationID * %.17e%s;\n", (double)(locStageAngle), LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (rader_stage != (int64_t)sc->currentRaderContainer->numStages - 1) {
					for (uint64_t i = 0; i < locStageRadix; i++) {
						uint64_t id = j + i * logicalRegistersPerThread / locStageRadix;
						id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
						if (!raderTranspose) {
							sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s + %" PRIu64 ") + %s*%" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix + sc->fftDim / stageRadix, sc->raderIDx2, locFFTDimStride);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s + %" PRIu64 ")*%" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix, sc->currentRaderContainer->containerFFTNum, sc->raderIDx2, sc->fftDim / stageRadix);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strided) {
							if (sc->resolveBankConflictFirstStages == 1) {
								sc->tempLen = sprintf(sc->tempStr, "\
	%s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						if (strided) {
							sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							if (sc->localSize[1] > 1) {
								sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->regIDs[id], sc->sdataID);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				char** regID = (char**)malloc(sizeof(char*) * locStageRadix);
				if (regID) {
					for (uint64_t i = 0; i < locStageRadix; i++) {
						regID[i] = (char*)malloc(sizeof(char) * 50);
						if (!regID[i]) {
							for (uint64_t p = 0; p < i; p++) {
								free(regID[p]);
								regID[p] = 0;
							}
							free(regID);
							regID = 0;
							return VKFFT_ERROR_MALLOC_FAILED;
						}
						uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix;
						id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
						sprintf(regID[i], "%s", sc->regIDs[id]);
					}
					res = inlineRadixKernelVkFFT(sc, floatType, uintType, locStageRadix, locStageSize, locStageSizeSum, locStageAngle, regID);
					if (res != VKFFT_SUCCESS) return res;
					for (uint64_t i = 0; i < locStageRadix; i++) {
						uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix;
						id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
						sprintf(sc->regIDs[id], "%s", regID[i]);
					}
					for (uint64_t i = 0; i < locStageRadix; i++) {
						free(regID[i]);
						regID[i] = 0;
					}
					free(regID);
					regID = 0;
				}
				else
					return VKFFT_ERROR_MALLOC_FAILED;
				if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) {
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadEnd(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
		if (!strided) {
			if (rader_stage == 0) {
				if (sc->sharedStrideRaderFFT > 0) {
					sc->tempLen = sprintf(sc->tempStr, "\
		sharedStride = %" PRIu64 ";\n", sc->fftDim);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		res = appendZeropadStart(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
		if (res != VKFFT_SUCCESS) return res;
		//local shuffle
		char** tempID;
		tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
		if (tempID) {
			for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
				tempID[i] = (char*)malloc(sizeof(char) * 50);
				if (!tempID[i]) {
					for (uint64_t j = 0; j < i; j++) {
						free(tempID[j]);
						tempID[j] = 0;
					}
					free(tempID);
					tempID = 0;
					return VKFFT_ERROR_MALLOC_FAILED;
				}
			}
			for (uint64_t k = 0; k < sc->registerBoost; ++k) {
				uint64_t t = 0;
				if (!raderTranspose) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (rader_stage == 0) {
					res = VkMovReal(sc, sc->stageInvocationID, sc->raderIDx2);
					if (res != VKFFT_SUCCESS) return res;
					res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID);
					if (res != VKFFT_SUCCESS) return res;
					sprintf(tempNum, "%" PRIu64 "", stageSize);
					res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum);
					if (res != VKFFT_SUCCESS) return res;
					res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID);
					if (res != VKFFT_SUCCESS) return res;
					sprintf(tempNum, "%" PRIu64 "", stageRadix);
					res = VkMulReal(sc, sc->raderIDx2, sc->blockInvocationID, tempNum);
					if (res != VKFFT_SUCCESS) return res;
					res = VkAddReal(sc, sc->raderIDx2, sc->raderIDx2, sc->stageInvocationID);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (!strided) {
					if (rader_stage != (int64_t)sc->currentRaderContainer->numStages - 1) {
						shift = (subLogicalGroupSize > (locFFTDim % (sc->numSharedBanks / 2))) ? subLogicalGroupSize - locFFTDim % (sc->numSharedBanks / 2) : 0;
						if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift;
					}
					else {
						shift = ((locFFTDim % (sc->numSharedBanks / 2))) ? 0 : 1;
						if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift;
					}
				}
				for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) {
					if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) {
						if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) {
							uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize;
							sc->tempLen = sprintf(sc->tempStr, "\
		if (%s  < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sprintf(tempNum, "%" PRIu64 "", j * subLogicalGroupSize);
						res = VkAddReal(sc, sc->stageInvocationID, sc->raderIDx, tempNum);
						if (res != VKFFT_SUCCESS) return res;
						res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID);
						if (res != VKFFT_SUCCESS) return res;
						sprintf(tempNum, "%" PRIu64 "", locStageSize);
						res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum);
						if (res != VKFFT_SUCCESS) return res;
						res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID);
						if (res != VKFFT_SUCCESS) return res;
						sprintf(tempNum, "%" PRIu64 "", locStageRadix);
						res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum);
						if (res != VKFFT_SUCCESS) return res;
						res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID);
						if (res != VKFFT_SUCCESS) return res;

					}
					/*sc->tempLen = sprintf(sc->tempStr, "\
	stageInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") %% (%" PRIu64 ");\n\
	blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\
	inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/

					for (uint64_t i = 0; i < locStageRadix; i++) {
						uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix;
						id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
						sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
						t++;
						if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) {
							sprintf(tempNum, "%" PRIu64 "", i * locStageSize);
							res = VkAddReal(sc, sc->combinedID, sc->inoutID, tempNum);
							if (res != VKFFT_SUCCESS) return res;

							if (rader_stage == 0) {
								locFFTDimStride = locFFTDim;
								//last stage - add x0

								uint64_t g = sc->currentRaderContainer->generator;
								if (sc->inline_rader_g_pow == 1) {
									sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow_%" PRIu64 "[%" PRIu64 "-%s];\n", sc->combinedID, stageRadix, stageRadix - 1, sc->combinedID);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
								}
								else if (sc->inline_rader_g_pow == 2) {
									sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow[%" PRIu64 "-%s];\n", sc->combinedID, stageRadix - 1 + sc->currentRaderContainer->raderUintLUToffset, sc->combinedID);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
								}
								else {
									sc->tempLen = sprintf(sc->tempStr, "\
			%s= (%" PRIu64 "-%s);\n\
			%s=1;\n\
			while (%s != 0)\n\
			{\n\
				%s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\
				%s--;\n\
			}\n", sc->inoutID, stageRadix - 1, sc->combinedID, sc->sdataID, sc->inoutID, sc->combinedID, sc->combinedID, g, stageRadix, sc->inoutID);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;
								}
								if (sc->inverse) {
									sprintf(tempNum, "(%" PRIu64 "-%s)*%" PRIu64 "", (stageRadix), sc->combinedID, stageSize);
								}
								else {
									sprintf(tempNum, "%s*%" PRIu64 "", sc->combinedID, stageSize);
								}
								res = VkAddReal(sc, sc->sdataID, sc->raderIDx2, tempNum);
								if (res != VKFFT_SUCCESS) return res;
								//normalization is in kernel
								/*sprintf(tempNum, "%.17e%s", 1.0 / locFFTDim, LFending);
								res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], tempNum);
								if (res != VKFFT_SUCCESS) return res;*/
								res = VkAddComplex(sc, sc->regIDs[id], sc->regIDs[id], sc->x0[0]);
								if (res != VKFFT_SUCCESS) return res;

							}
							else {
								if (!raderTranspose) {
									sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix);
									res = VkAddReal(sc, sc->sdataID, sc->combinedID, tempNum);
									if (res != VKFFT_SUCCESS) return res;
									sprintf(tempNum, "%s * %" PRIu64 "", sc->raderIDx2, locFFTDimStride);
									res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum);
									if (res != VKFFT_SUCCESS) return res;
								}
								else {
									sprintf(tempNum, "%" PRIu64 "", sc->currentRaderContainer->containerFFTNum);
									res = VkMulReal(sc, sc->sdataID, sc->combinedID, tempNum);
									if (res != VKFFT_SUCCESS) return res;
									sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix);
									res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum);
									if (res != VKFFT_SUCCESS) return res;
									sprintf(tempNum, "%s", sc->raderIDx2);
									res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum);
									if (res != VKFFT_SUCCESS) return res;
								}
							}
							if (!strided) {
								if (0 && (locStageSize <= sc->numSharedBanks / 2) && (locFFTsCombined > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != locFFTDim / sc->registerBoost) && ((locFFTDim & (locFFTDim - 1)) == 0) && (locStageSize * locStageRadix != locFFTDim)) {
									if (sc->resolveBankConflictFirstStages == 0) {
										sc->resolveBankConflictFirstStages = 1;
										sc->tempLen = sprintf(sc->tempStr, "\
	%s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideBankConflictFirstStages);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
									sc->tempLen = sprintf(sc->tempStr, "\
	%s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) return res;

								}
								else {
									if (sc->resolveBankConflictFirstStages == 1) {
										sc->resolveBankConflictFirstStages = 0;
										sc->tempLen = sprintf(sc->tempStr, "\
	%s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideReadWriteConflict);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) return res;
									}
								}
							}
							if (strided) {
								res = VkMulReal(sc, sc->sdataID, sc->sdataID, sc->sharedStride);
								if (res != VKFFT_SUCCESS) return res;
								res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								if (sc->localSize[1] > 1) {
									res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride);
									if (res != VKFFT_SUCCESS) return res;
									res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID);
									if (res != VKFFT_SUCCESS) return res;
								}
							}
							//sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize);
							if ((((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) && (rader_stage == 0)) {
								if (strcmp(stageNormalization, "")) {
									res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization);
								}
								if (res != VKFFT_SUCCESS) return res;
							}
							res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->sdataID, sc->regIDs[id], sc->regIDs[id]);
							//res = VkAppendLine(sc);
							//if (res != VKFFT_SUCCESS) return res;
						}
						/*sc->tempLen = sprintf(sc->tempStr, "\
sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/
					}
					if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) {
						if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) {
							sc->tempLen = sprintf(sc->tempStr, "	}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) {
					sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]);
					t++;
				}
				t = 0;
			}
			for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
				//printf("0 - %s\n", resID[i]);
				sprintf(sc->regIDs[i], "%s", tempID[i]);
				//sprintf(resID[i], "%s", tempID[i]);
				//printf("1 - %s\n", resID[i]);
			}
			for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
				free(tempID[i]);
				tempID[i] = 0;
			}
			free(tempID);
			tempID = 0;
		}
		else
			return VKFFT_ERROR_MALLOC_FAILED;

		if (rader_stage < (int64_t)sc->currentRaderContainer->numStages - 1) {
			switch (locStageRadix) {
			case 2:
				locStageSizeSum += locStageSize;
				break;
			case 3:
				locStageSizeSum += locStageSize * 2;
				break;
			case 4:
				locStageSizeSum += locStageSize * 2;
				break;
			case 5:
				locStageSizeSum += locStageSize * 4;
				break;
			case 6:
				locStageSizeSum += locStageSize * 5;
				break;
			case 7:
				locStageSizeSum += locStageSize * 6;
				break;
			case 8:
				locStageSizeSum += locStageSize * 3;
				break;
			case 9:
				locStageSizeSum += locStageSize * 8;
				break;
			case 10:
				locStageSizeSum += locStageSize * 9;
				break;
			case 11:
				locStageSizeSum += locStageSize * 10;
				break;
			case 12:
				locStageSizeSum += locStageSize * 11;
				break;
			case 13:
				locStageSizeSum += locStageSize * 12;
				break;
			case 14:
				locStageSizeSum += locStageSize * 13;
				break;
			case 15:
				locStageSizeSum += locStageSize * 14;
				break;
			case 16:
				locStageSizeSum += locStageSize * 4;
				break;
			case 32:
				locStageSizeSum += locStageSize * 5;
				break;
			default:
				locStageSizeSum += locStageSize * (locStageRadix);
				break;
			}
		}
		locStageSize *= locStageRadix;
		locStageAngle /= locStageRadix;
		res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadEnd(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadStart(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
		if (res != VKFFT_SUCCESS) return res;
	}

	{
		uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[sc->currentRaderContainer->numStages - 1];
		uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
		//uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
		uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs
		//uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim;
		//uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread);
		uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix 
		if (!raderTranspose) {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (!raderTranspose) {
			sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		sc->tempLen = sprintf(sc->tempStr, "\
		if (%s == 0) {\n", sc->raderIDx);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;

		res = VkMovReal(sc, sc->stageInvocationID, sc->raderIDx2);
		if (res != VKFFT_SUCCESS) return res;
		res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID);
		if (res != VKFFT_SUCCESS) return res;
		sprintf(tempNum, "%" PRIu64 "", stageSize);
		res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum);
		if (res != VKFFT_SUCCESS) return res;
		res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID);
		if (res != VKFFT_SUCCESS) return res;
		sprintf(tempNum, "%" PRIu64 "", stageRadix);
		res = VkMulReal(sc, sc->raderIDx2, sc->blockInvocationID, tempNum);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAddReal(sc, sc->raderIDx2, sc->raderIDx2, sc->stageInvocationID);
		if (res != VKFFT_SUCCESS) return res;

		sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s;\n", sc->sdataID, sc->raderIDx2);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;

		if (strided) {
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (sc->localSize[1] > 1) {
				sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) {
			if (strcmp(stageNormalization, "")) {
				res = VkMulComplexNumber(sc, sc->x0[1], sc->x0[1], stageNormalization);
			}
			if (res != VKFFT_SUCCESS) return res;
		}

		sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s] = %s;\n", sc->sdataID, sc->x0[1]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadEnd(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
	}
	return res;
}
static inline VkFFTResult appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageID, uint64_t strided) {
	VkFFTResult res = VKFFT_SUCCESS;
	long double double_PI = 3.14159265358979323846264338327950288419716939937510L;
	char vecType[30];
	char LFending[4] = "";
	char tempNum[50] = "";
	if (!strcmp(floatType, "float")) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
	char cosDef[20] = "cos";
	char sinDef[20] = "sin";
	if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
#elif(VKFFT_BACKEND==1)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	char cosDef[20] = "__cosf";
	char sinDef[20] = "__sinf";
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#elif(VKFFT_BACKEND==2)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	char cosDef[20] = "__cosf";
	char sinDef[20] = "__sinf";
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	char cosDef[20] = "native_cos";
	char sinDef[20] = "native_sin";
	//if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#endif
	char stageNormalization[50] = "";
	uint64_t normalizationValue = 1;
	if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) {
		if ((sc->performDCT) && (sc->actualInverse)) {
			if (sc->performDCT == 1)
				normalizationValue = (sc->sourceFFTSize - 1) * 2;
			else
				normalizationValue = sc->sourceFFTSize * 2;
		}
		else
			normalizationValue = sc->sourceFFTSize;
	}
	if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) {
		normalizationValue *= sc->fft_dim_full;
	}
	if (normalizationValue != 1) {
		sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending);
	}
	char convolutionInverse[10] = "";
	if (sc->convolutionStep) {
		if (stageAngle < 0)
			sprintf(convolutionInverse, ", 0");
		else
			sprintf(convolutionInverse, ", 1");
	}
	res = appendBarrierVkFFT(sc, 1);
	if (res != VKFFT_SUCCESS) return res;
	res = appendZeropadStart(sc);
	if (res != VKFFT_SUCCESS) return res;
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;

	uint64_t num_logical_subgroups = (strided) ? sc->localSize[1] / ((stageRadix + 1) / 2) : sc->localSize[0] / ((stageRadix + 1) / 2);
	uint64_t num_logical_groups = (uint64_t)ceil((sc->fftDim / stageRadix) / (double)(num_logical_subgroups));
	uint64_t require_cutoff_check = ((sc->fftDim == (num_logical_subgroups * num_logical_groups * stageRadix))) ? 0 : 1;
	uint64_t require_cutoff_check2;
	char* gl_LocalInvocationID = (strided) ? sc->gl_LocalInvocationID_y : sc->gl_LocalInvocationID_x;

	if (strided) {
		require_cutoff_check2 = ((sc->localSize[1] % ((stageRadix + 1) / 2)) == 0) ? 0 : 1;
	}
	else {
		require_cutoff_check2 = ((sc->localSize[0] % ((stageRadix + 1) / 2)) == 0) ? 0 : 1;
	}
	sc->tempLen = sprintf(sc->tempStr, "	%s= %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, (stageRadix + 1) / 2);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, "	%s= %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, (stageRadix + 1) / 2);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;

	for (uint64_t k = 0; k < sc->registerBoost; k++) {
		for (uint64_t j = 0; j < 1; j++) {
			if (stageSize > 1) {
				if (require_cutoff_check2) {
					if (strided) {
						sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				for (uint64_t t = 0; t < num_logical_groups; t++) {
					if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
						sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+%" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx2, t * num_logical_subgroups, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->LUT)
						sc->tempLen = sprintf(sc->tempStr, "		LUTId = stageInvocationID*%" PRIu64 " + %" PRIu64 ";\n", stageRadix, stageSizeSum);
					else
						sc->tempLen = sprintf(sc->tempStr, "		angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->LUT) {
						sc->tempLen = sprintf(sc->tempStr, "		%s = twiddleLUT[LUTId+%s];\n\n", sc->w, sc->raderIDx);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (!sc->inverse) {
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = -%s.y;\n", sc->w, sc->w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s(angle*%.17e%s*(%s));\n", sc->w, cosDef, 2.0 / stageRadix, LFending, sc->raderIDx);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s(angle*%.17e%s*(%s));\n", sc->w, sinDef, 2.0 / stageRadix, LFending, sc->raderIDx);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "		%s = sincos_20(angle*%.17e%s*(%s));\n", sc->w, 2.0 / stageRadix, LFending, sc->raderIDx);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					//sc->tempLen = sprintf(sc->tempStr, "	printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->w, sc->w);
					//res = VkAppendLine(sc);
					//if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s) * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (strided) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (sc->localSize[1] > 1) {
							sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->regIDs[0], sc->sdataID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					res = VkMulComplex(sc, sc->temp, sc->regIDs[0], sc->w, 0);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s] = %s;\n", sc->sdataID, sc->temp);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+%" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx2, t * num_logical_subgroups, stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->LUT)
						sc->tempLen = sprintf(sc->tempStr, "		LUTId = stageInvocationID*%" PRIu64 " + %" PRIu64 ";\n", stageRadix, stageSizeSum);
					else
						sc->tempLen = sprintf(sc->tempStr, "		angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (sc->LUT) {
						sc->tempLen = sprintf(sc->tempStr, "		%s = twiddleLUT[LUTId+%s+%" PRIu64 "];\n\n", sc->w, sc->raderIDx, (stageRadix + 1) / 2);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (!sc->inverse) {
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = -%s.y;\n", sc->w, sc->w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					else {
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s(angle*%.17e%s*(%" PRIu64 " + %s));\n", sc->w, cosDef, 2.0 / stageRadix, LFending, (stageRadix + 1) / 2, sc->raderIDx);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s(angle*%.17e%s*(%" PRIu64 " + %s));\n", sc->w, sinDef, 2.0 / stageRadix, LFending, (stageRadix + 1) / 2, sc->raderIDx);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "		%s = sincos_20(angle*%.17e%s*(%" PRIu64 " + %s));\n", sc->w, 2.0 / stageRadix, LFending, (stageRadix + 1) / 2, sc->raderIDx);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					//sc->tempLen = sprintf(sc->tempStr, "	printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->w, sc->w);
					//res = VkAppendLine(sc);
					//if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%" PRIu64 " + %s) * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, (stageRadix + 1) / 2, sc->raderIDx, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (strided) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (sc->localSize[1] > 1) {
							sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->regIDs[0], sc->sdataID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					res = VkMulComplex(sc, sc->temp, sc->regIDs[0], sc->w, 0);
					if (res != VKFFT_SUCCESS) return res;

					sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s] = %s;\n", sc->sdataID, sc->temp);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
						sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				if (require_cutoff_check2) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
				if (res != VKFFT_SUCCESS) return res;
				res = appendZeropadEnd(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = appendBarrierVkFFT(sc, 1);
				if (res != VKFFT_SUCCESS) return res;
				res = appendZeropadStart(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (require_cutoff_check2) {
				if (strided) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2));
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2));
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			//save x0
			for (uint64_t t = 0; t < num_logical_groups; t++) {
				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (strided) {
					if (sc->localSize[0] > 1) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s + %" PRIu64 ") * sharedStride + %s;\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups, sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				else {
					if (sc->localSize[1] > 1) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 " + sharedStride * %s;\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups, sc->gl_LocalInvocationID_y);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->x0[t], sc->sdataID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			//generator index + shuffle 
			sc->tempLen = sprintf(sc->tempStr, "\
		if(%s>0){\n", sc->raderIDx);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			uint64_t g = sc->currentRaderContainer->generator;
			if (sc->inline_rader_g_pow == 1) {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow_%" PRIu64 "[%s-1];\n", sc->sdataID, stageRadix, sc->raderIDx);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else if (sc->inline_rader_g_pow == 2) {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow[%s-1+%" PRIu64 "];\n", sc->sdataID, sc->raderIDx, sc->currentRaderContainer->raderUintLUToffset);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= (%s-1);\n\
			%s=1;\n\
			while (%s != 0)\n\
			{\n\
				%s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\
				%s--;\n\
			}\n", sc->inoutID, sc->raderIDx, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t t = 0; t < num_logical_groups; t++) {
				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}

				sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->combinedID, sc->sdataID, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (strided) {
					sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					if (sc->localSize[1] > 1) {
						sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				sc->tempLen = sprintf(sc->tempStr, "\
			%s = sdata[%s];\n", sc->regIDs[t * 2], sc->combinedID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			if (sc->inline_rader_g_pow == 1) {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow_%" PRIu64 "[%s+ %" PRIu64 "];\n", sc->sdataID, stageRadix, sc->raderIDx, (stageRadix - 1) / 2 - 1);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else if (sc->inline_rader_g_pow == 2) {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow[%s+ %" PRIu64 "];\n", sc->sdataID, sc->raderIDx, (stageRadix - 1) / 2 - 1 + sc->currentRaderContainer->raderUintLUToffset);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= (%s+ %" PRIu64 ");\n\
			%s=1;\n\
			while (%s != 0)\n\
			{\n\
				%s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\
				%s--;\n\
			}\n", sc->inoutID, sc->raderIDx, (stageRadix - 1) / 2 - 1, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}

			for (uint64_t t = 0; t < num_logical_groups; t++) {
				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}

				sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->combinedID, sc->sdataID, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (strided) {
					sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					if (sc->localSize[1] > 1) {
						sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				sc->tempLen = sprintf(sc->tempStr, "\
			%s = sdata[%s];\n", sc->regIDs[2 * t + 1], sc->combinedID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			if (require_cutoff_check2) {
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}

			res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			//load deconv kernel
			if (!sc->inline_rader_kernel) {
				for (uint64_t t = 0; t < (uint64_t)ceil((stageRadix - 1) / ((double)(sc->localSize[0] * sc->localSize[1]))); t++) {
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %s *  %" PRIu64 " + %" PRIu64 ";\n", sc->combinedID, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->localSize[0], t * sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (t == ((uint64_t)ceil((stageRadix - 1) / ((double)(sc->localSize[0] * sc->localSize[1]))) - 1)) {
						sc->tempLen = sprintf(sc->tempStr, "\
		if(%s < %" PRIu64 "){\n", sc->combinedID, (stageRadix - 1));
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (sc->LUT) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = twiddleLUT[%s+%" PRIu64 "];\n", sc->w, sc->combinedID, sc->currentRaderContainer->RaderKernelOffsetLUT);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						if (sc->inverse) {
							sc->tempLen = sprintf(sc->tempStr, "		%s.y = -%s.y;\n", sc->w, sc->w);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s+%" PRIu64 "] = %s;\n", sc->combinedID, sc->RaderKernelOffsetShared[stageID], sc->w);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (sc->inline_rader_g_pow == 1) {
							sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow_%" PRIu64 "[%" PRIu64 " - %s];\n", sc->sdataID, stageRadix, stageRadix - 1, sc->combinedID);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else if (sc->inline_rader_g_pow == 2) {
							sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow[%" PRIu64 " - %s];\n", sc->sdataID, stageRadix - 1 + sc->currentRaderContainer->raderUintLUToffset, sc->combinedID);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "\
			%s= (%" PRIu64 " - %s);\n\
			%s=1;\n\
			while (%s != 0)\n\
			{\n\
				%s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\
				%s--;\n\
			}\n", sc->inoutID, stageRadix - 1, sc->combinedID, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (!strcmp(floatType, "float")) {
							sc->tempLen = sprintf(sc->tempStr, "		%s.x = %s(%.17e%s*%s);\n", sc->w, cosDef, (double)(2.0 * double_PI / stageRadix), LFending, sc->sdataID);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if (sc->inverse) {
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = %s(%.17e%s*%s);\n", sc->w, sinDef, (double)(2.0 * double_PI / stageRadix), LFending, sc->sdataID);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = -%s(%.17e%s*%s);\n", sc->w, sinDef, (double)(2.0 * double_PI / stageRadix), LFending, sc->sdataID);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							//sc->tempLen = sprintf(sc->tempStr, "	w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
						}
						if (!strcmp(floatType, "double")) {
							sc->tempLen = sprintf(sc->tempStr, "		%s = sincos_20(%.17e%s*%s);\n", sc->w, (double)(2.0 * double_PI / stageRadix), LFending, sc->sdataID);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
							if (!sc->inverse) {
								sc->tempLen = sprintf(sc->tempStr, "		%s.y = -%s.y;\n", sc->w, sc->w);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s+%" PRIu64 "] = %s;\n", sc->combinedID, sc->RaderKernelOffsetShared[stageID], sc->w);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					if (t == ((uint64_t)ceil((stageRadix - 1) / ((double)(sc->localSize[0] * sc->localSize[1]))) - 1)) {
						sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
			}
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;

			if (require_cutoff_check2) {
				if (strided) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2));
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2));
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			//x0 is ready

			//no subgroups
			/* {
				sc->tempLen = sprintf(sc->tempStr, "\
		if(%s==0){\n", sc->gl_LocalInvocationID_x);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\
		%s.x = 0;\n\
		%s.y = 0;\n", sc->regIDs[0], sc->regIDs[0]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\
		%s = 0;\n", sc->combinedID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if (sc->localSize[1] > 1) {
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * (%s);\n", sc->sdataID, sc->combinedID, sc->gl_LocalInvocationID_y);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\
		while(%s<%" PRIu64 "){\n\
		%s.x += sdata[%s].x;\n\
		%s.y += sdata[%s].y;\n\
		%s++; %s++;}\n", sc->combinedID, stageRadix, sc->regIDs[0], sc->sdataID, sc->regIDs[0], sc->sdataID, sc->combinedID, sc->sdataID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "\
		while(%s<%" PRIu64 "){\n\
		%s.x += sdata[%s].x;\n\
		%s.y += sdata[%s].y;\n\
		%s++;}\n", sc->combinedID, stageRadix, sc->regIDs[0], sc->combinedID, sc->regIDs[0], sc->combinedID, sc->combinedID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		%s = 0;\n", sc->sdataID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if (sc->localSize[1] > 1) {
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * (%s);\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s] = %s;\n", sc->sdataID, sc->regIDs[0]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}*/
			//subgroups
			/* {
				uint64_t numGroupsQuant = ((((sc->localSize[0] * sc->localSize[1] * sc->localSize[2]) % sc->warpSize) == 0) || (sc->numSubgroups == 1)) ? sc->numSubgroups : sc->numSubgroups - 1;
				if (numGroupsQuant != sc->numSubgroups) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, numGroupsQuant);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				for (uint64_t t = 0; t < (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant); t++) {
					sc->tempLen = sprintf(sc->tempStr, "\
		%s.x = 0;\n", sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\
		%s.y = 0;\n", sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					uint64_t quant = (sc->warpSize < (sc->localSize[0] * sc->localSize[1] * sc->localSize[2])) ? sc->warpSize : (sc->localSize[0] * sc->localSize[1] * sc->localSize[2]);
					for (uint64_t t2 = 0; t2 < (uint64_t)ceil(stageRadix / (double)quant); t2++) {
						if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) {
							sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, sc->localSize[1] % numGroupsQuant);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if (t2 == (uint64_t)ceil(stageRadix / (double)quant) - 1) {
							sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_SubgroupInvocationID, stageRadix % quant);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+%" PRIu64 ") * %" PRIu64 ";\n", sc->sdataID, sc->gl_SubgroupInvocationID, t2 * quant, sc->fftDim / stageRadix);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (sc->localSize[1] > 1) {
							sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * (%s+%" PRIu64 ");\n", sc->sdataID, sc->sdataID, sc->gl_SubgroupID, t * numGroupsQuant);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->regIDs[1], sc->sdataID);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
						res = VkAddComplex(sc, sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]);
						if (res != VKFFT_SUCCESS) return res;
						if (t2 == (uint64_t)ceil(stageRadix / (double)quant) - 1) {
							sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) {
							sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}

					res = VkSubgroupAdd(sc, sc->regIDs[0], sc->regIDs[0], 1);
					if (res != VKFFT_SUCCESS) return res;

					if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) {
						sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, sc->localSize[1] % numGroupsQuant);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s==0){\n", sc->gl_SubgroupInvocationID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = 0;\n", sc->sdataID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->localSize[1] > 1) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * (%s+%" PRIu64 ");\n", sc->sdataID, sc->sdataID, sc->gl_SubgroupID, t * numGroupsQuant);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s] = %s;\n", sc->sdataID, sc->regIDs[0]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) {
						sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				if (numGroupsQuant != sc->numSubgroups) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}*/

			sc->tempLen = sprintf(sc->tempStr, "\
		if(%s > 0){\n", sc->raderIDx);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t t = 0; t < num_logical_groups; t++) {
				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s) * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (strided) {
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 " * sharedStride;\n", sc->combinedID, sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					if (sc->localSize[1] > 1) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 ";\n", sc->combinedID, sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}

				sc->tempLen = sprintf(sc->tempStr, "\
		%s.x = %s.x - %s.x;\n", sc->temp, sc->regIDs[2 * t], sc->regIDs[2 * t + 1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\
		%s.x += %s.x;\n", sc->regIDs[2 * t], sc->regIDs[2 * t + 1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\
		%s.y = %s.y + %s.y;\n", sc->temp, sc->regIDs[2 * t], sc->regIDs[2 * t + 1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\
		%s.y -= %s.y;\n", sc->regIDs[2 * t], sc->regIDs[2 * t + 1]);
				res = VkAppendLine(sc);
				sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s] = %s;\n", sc->sdataID, sc->regIDs[2 * t]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s] = %s;\n", sc->combinedID, sc->temp);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			//sc->tempLen = sprintf(sc->tempStr, "	printf(\"%%d %%f %%f %%f %%f \\n \", %s, %s.x, %s.y, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->regIDs[0], sc->regIDs[0], sc->temp, sc->temp);
			//res = VkAppendLine(sc);
			//if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			if (require_cutoff_check2) {
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;
			if (require_cutoff_check2) {
				if (strided) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2));
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2));
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}

			sc->tempLen = sprintf(sc->tempStr, "\
		if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix + 1) / 2);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t t = 0; t < num_logical_groups; t++) {
				sc->tempLen = sprintf(sc->tempStr, "\
		%s.x = 0;\n", sc->regIDs[2 * t + 1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\
		%s.y = 0;\n", sc->regIDs[2 * t + 1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		if(%s == %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s.x = 1; %s.y = 0;\n", sc->w, sc->w);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t i = 0; i < (stageRadix - 1) / 2; i++) {

				sc->tempLen = sprintf(sc->tempStr, "\
		if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				sc->tempLen = sprintf(sc->tempStr, "\
		%s = ((%" PRIu64 "+%s) %% %" PRIu64 ");\n", sc->sdataID, stageRadix - 1 - i, sc->raderIDx, (stageRadix - 1));
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (sc->inline_rader_kernel) {
					sc->tempLen = sprintf(sc->tempStr, "\
		%s.x = r_rader_kernel_%" PRIu64 "[%s];\n\
		%s.y = i_rader_kernel_%" PRIu64 "[%s];\n", sc->w, stageRadix, sc->sdataID, sc->w, stageRadix, sc->sdataID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s+%" PRIu64 "];\n", sc->w, sc->sdataID, sc->RaderKernelOffsetShared[stageID]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				for (uint64_t t = 0; t < num_logical_groups; t++) {
#if(VKFFT_BACKEND != 2) //AMD compiler fix
					if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
						sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
#endif
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s+ %" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups + (1 + i) * sc->fftDim / stageRadix);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (strided) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						if (sc->localSize[1] > 1) {
							sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->regIDs[0], sc->sdataID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					if (strided) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s += %" PRIu64 "*sharedStride;\n", sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					else {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s += %" PRIu64 " ;\n", sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->temp, sc->sdataID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
#if(VKFFT_BACKEND == 2) //AMD compiler fix
					if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
						sc->tempLen = sprintf(sc->tempStr, "\
		if(%s>=%" PRIu64 "){%s.x =0;%s.y=0;%s.x=0;%s.y=0;}\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups, sc->temp, sc->temp, sc->regIDs[0], sc->regIDs[0]);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
#endif
					sprintf(tempNum, "%s", sc->x0[t]);
					res = VkFMA3Complex(sc, tempNum, sc->regIDs[2 * t + 1], sc->regIDs[0], sc->w, sc->temp);
					if (res != VKFFT_SUCCESS) return res;
#if(VKFFT_BACKEND != 2) //AMD compiler fix
					if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
						sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
#endif
#if(VKFFT_BACKEND == 2) //AMD compiler fix
					if ((uint64_t)ceil((sc->localSize[0] * sc->localSize[1]) / ((double)sc->warpSize)) * sc->warpSize * (1 + sc->registers_per_thread + sc->usedLocRegs) > 2048) {
						sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;

						if (require_cutoff_check2) {
							sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}

						res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
						if (res != VKFFT_SUCCESS) return res;
						res = appendZeropadEnd(sc);
						if (res != VKFFT_SUCCESS) return res;
						res = appendBarrierVkFFT(sc, 1);
						if (res != VKFFT_SUCCESS) return res;
						res = appendZeropadStart(sc);
						if (res != VKFFT_SUCCESS) return res;
						res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
						if (res != VKFFT_SUCCESS) return res;

						if (require_cutoff_check2) {
							if (strided) {
								sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2));
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
							else {
								sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2));
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) return res;
							}
						}
						sc->tempLen = sprintf(sc->tempStr, "\
		if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix + 1) / 2);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
#endif
				}
#if(VKFFT_BACKEND == 2) //AMD compiler fix
				if ((uint64_t)ceil((sc->localSize[0] * sc->localSize[1]) / ((double)sc->warpSize)) * sc->warpSize * (1 + sc->registers_per_thread + sc->usedLocRegs) <= 2048) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (require_cutoff_check2) {
						sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadEnd(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = appendBarrierVkFFT(sc, 1);
					if (res != VKFFT_SUCCESS) return res;
					res = appendZeropadStart(sc);
					if (res != VKFFT_SUCCESS) return res;
					res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
					if (res != VKFFT_SUCCESS) return res;

					if (require_cutoff_check2) {
						if (strided) {
							sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
						else {
							sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2));
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) return res;
						}
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix + 1) / 2);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
#endif
			}
			for (uint64_t t = 0; t < num_logical_groups; t++) {
				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sprintf(tempNum, "%s", sc->x0[t]);

				sc->tempLen = sprintf(sc->tempStr, "\
		%s.x = %s.x-%s.x;\n\
		%s.y = %s.y+%s.y;\n", sc->regIDs[2 * t], tempNum, sc->regIDs[2 * t + 1], sc->regIDs[2 * t], tempNum, sc->regIDs[2 * t + 1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\
		%s.x = %s.x+%s.x;\n\
		%s.y = %s.y-%s.y;\n", sc->regIDs[2 * t + 1], tempNum, sc->regIDs[2 * t + 1], sc->regIDs[2 * t + 1], tempNum, sc->regIDs[2 * t + 1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			if (require_cutoff_check2) {
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}

			res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;

			if (require_cutoff_check2) {
				if (strided) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2));
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2));
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}

			sc->tempLen = sprintf(sc->tempStr, "\
		if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "	printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->regIDs[1], sc->regIDs[1]);
			//res = VkAppendLine(sc);
			//if (res != VKFFT_SUCCESS) return res;
			if (sc->inline_rader_g_pow == 1) {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow_%" PRIu64 "[%" PRIu64 "-%s];\n", sc->sdataID, stageRadix, stageRadix - 1, sc->raderIDx);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else if (sc->inline_rader_g_pow == 2) {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow[%" PRIu64 "-%s];\n", sc->sdataID, stageRadix - 1 + sc->currentRaderContainer->raderUintLUToffset, sc->raderIDx);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= (%" PRIu64 "-%s);\n\
			%s=1;\n\
			while (%s != 0)\n\
			{\n\
				%s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\
				%s--;\n\
			}\n", sc->inoutID, stageRadix - 1, sc->raderIDx, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		}else{\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = 0;\n", sc->sdataID);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t t = 0; t < num_logical_groups; t++) {
				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}

				sprintf(tempNum, "%" PRIu64 "", t * num_logical_subgroups);
				res = VkAddReal(sc, sc->combinedID, sc->raderIDx2, tempNum);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(tempNum, "%" PRIu64 "", stageSize);
				res = VkModReal(sc, sc->stageInvocationID, sc->combinedID, tempNum);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubReal(sc, sc->blockInvocationID, sc->combinedID, sc->stageInvocationID);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(tempNum, "%" PRIu64 "", stageRadix);
				res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum);
				if (res != VKFFT_SUCCESS) return res;

				sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s + %s * %" PRIu64 " + %s;\n", sc->combinedID, sc->inoutID, sc->sdataID, stageSize, sc->stageInvocationID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (strided) {
					sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					if (sc->localSize[1] > 1) {
						sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) {
					if (strcmp(stageNormalization, "")) {
						res = VkMulComplexNumber(sc, sc->regIDs[2 * t], sc->regIDs[2 * t], stageNormalization);
					}
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s]=%s;\n", sc->combinedID, sc->regIDs[2 * t]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			if (sc->inline_rader_g_pow == 1) {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow_%" PRIu64 "[%" PRIu64 "-%s];\n", sc->sdataID, stageRadix, (stageRadix - 1) / 2, sc->raderIDx);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else if (sc->inline_rader_g_pow == 2) {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= g_pow[%" PRIu64 "-%s];\n", sc->sdataID, (stageRadix - 1) / 2 + sc->currentRaderContainer->raderUintLUToffset, sc->raderIDx);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "\
			%s= (%" PRIu64 "-%s);\n\
			%s=1;\n\
			while (%s != 0)\n\
			{\n\
				%s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\
				%s--;\n\
			}\n", sc->inoutID, (stageRadix - 1) / 2, sc->raderIDx, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t t = 0; t < num_logical_groups; t++) {
				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}

				sprintf(tempNum, "%" PRIu64 "", t * num_logical_subgroups);
				res = VkAddReal(sc, sc->combinedID, sc->raderIDx2, tempNum);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(tempNum, "%" PRIu64 "", stageSize);
				res = VkModReal(sc, sc->stageInvocationID, sc->combinedID, tempNum);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubReal(sc, sc->blockInvocationID, sc->combinedID, sc->stageInvocationID);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(tempNum, "%" PRIu64 "", stageRadix);
				res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s + %s * %" PRIu64 " + %s;\n", sc->combinedID, sc->inoutID, sc->sdataID, stageSize, sc->stageInvocationID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (strided) {
					sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					if (sc->localSize[1] > 1) {
						sc->tempLen = sprintf(sc->tempStr, "\
			%s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
				}
				if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) {
					if (strcmp(stageNormalization, "")) {
						res = VkMulComplexNumber(sc, sc->regIDs[2 * t + 1], sc->regIDs[2 * t + 1], stageNormalization);
					}
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s]=%s;\n", sc->combinedID, sc->regIDs[2 * t + 1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;

				if ((require_cutoff_check) && (t == num_logical_groups - 1)) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			if (require_cutoff_check2) {
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
		}
	}

	return res;
}

// Helper of appendRadixStageNonStrided: runs one inlined radix kernel on the registers selected by (j, k).
// Builds a temporary table of stageRadix register names copied from sc->regIDs, passes it to
// inlineRadixKernelVkFFT and, on success, copies the (possibly changed) names back into sc->regIDs.
// The temporary table is released on every path, including kernel failure.
// Returns VKFFT_ERROR_MALLOC_FAILED if the table cannot be allocated, otherwise the result of inlineRadixKernelVkFFT.
static inline VkFFTResult appendRadixStageNonStrided_inlineKernel(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t j, uint64_t k, uint64_t logicalRegistersPerThread, uint64_t logicalStoragePerThread) {
	VkFFTResult res = VKFFT_SUCCESS;
	char** regID = (char**)malloc(sizeof(char*) * stageRadix);
	if (!regID) return VKFFT_ERROR_MALLOC_FAILED;
	for (uint64_t i = 0; i < stageRadix; i++) {
		regID[i] = (char*)malloc(sizeof(char) * 50);
		if (!regID[i]) {
			// roll back the entries allocated so far
			for (uint64_t p = 0; p < i; p++) {
				free(regID[p]);
				regID[p] = 0;
			}
			free(regID);
			regID = 0;
			return VKFFT_ERROR_MALLOC_FAILED;
		}
		uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
		id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
		sprintf(regID[i], "%s", sc->regIDs[id]);
	}
	res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageSizeSum, stageAngle, regID);
	if (res == VKFFT_SUCCESS) {
		// the kernel receives the name table by pointer; store the names it leaves there
		for (uint64_t i = 0; i < stageRadix; i++) {
			uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
			id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
			sprintf(sc->regIDs[id], "%s", regID[i]);
		}
	}
	// release the table regardless of the kernel result (previously leaked on failure)
	for (uint64_t i = 0; i < stageRadix; i++) {
		free(regID[i]);
		regID[i] = 0;
	}
	free(regID);
	regID = 0;
	return res;
}

// Generates the code of one radix stage in which threads are indexed by sc->gl_LocalInvocationID_x.
// Every generated line is formatted into sc->tempStr and appended with VkAppendLine.
// Parameters:
//   sc           - generator state; read for the layout (fftDim, localSize, registers_per_thread_per_radix,
//                  registerBoost, ...) and updated (useCoalescedLUTUploadToSM, regIDs).
//   floatType    - "float" or "double"; selects the literal suffix of the generated angle constant and is
//                  forwarded to inlineRadixKernelVkFFT.
//   uintType     - forwarded unchanged to inlineRadixKernelVkFFT.
//   stageSize    - modulus used when computing stageInvocationID; also compared against 1 and sc->warpSize.
//   stageSizeSum - offset added to the LUT index (LUTId / twiddleLUT) of this stage.
//   stageAngle   - multiplier of stageInvocationID when no LUT is used; its sign is tested in the
//                  barrier / shared-memory-read conditions.
//   stageRadix   - radix of the stage; indexes sc->registers_per_thread_per_radix.
// Returns VKFFT_SUCCESS, VKFFT_ERROR_MALLOC_FAILED, or the first error reported by a callee.
static inline VkFFTResult appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix) {
	VkFFTResult res = VKFFT_SUCCESS;
	// suffix appended to the floating point angle literal in the generated code
	char LFending[4] = "";
	if (!strcmp(floatType, "float")) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#endif

	uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;
	uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];
	uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread);
	if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->localSize[1] > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))
	{
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
	}
	res = appendZeropadStart(sc);
	if (res != VKFFT_SUCCESS) return res;
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;

	//upload second stage of LUT to sm
	uint64_t numLUTelementsStage = 0;
	switch (stageRadix) {
	case 2:
		numLUTelementsStage = 1;
		break;
	case 4:
		numLUTelementsStage = 2;
		break;
	case 8:
		numLUTelementsStage = 3;
		break;
	case 16:
		numLUTelementsStage = 4;
		break;
	case 32:
		numLUTelementsStage = 5;
		break;
	default:
		if (stageRadix < sc->fixMinRaderPrimeMult)
			numLUTelementsStage = stageRadix - 1;
		else
			numLUTelementsStage = stageRadix;
		break;
	}
	if ((sc->LUT) && (stageSize > 1) && ((((numLUTelementsStage >= 4) && (sc->fftDim >= 1024)) || (((numLUTelementsStage >= 3) && (sc->fftDim < 1024)))) || (logicalRegistersPerThread / stageRadix > 1)) && (sc->registerBoost == 1) && (stageSize < sc->warpSize))
		sc->useCoalescedLUTUploadToSM = 1;
	else
		sc->useCoalescedLUTUploadToSM = 0;

	for (uint64_t k = 0; k < sc->registerBoost; k++) {
		// only the first logicalGroupSize threads take part when the workgroup is wider
		if (logicalGroupSize != sc->localSize[0]) {
			sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalGroupSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
			if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue;
			// last, partially filled group: restrict it to the threads that still have data
			if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) {
				uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize;
				sc->tempLen = sprintf(sc->tempStr, "\
		if (%s  < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}

			sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_x, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			if (sc->LUT)
				sc->tempLen = sprintf(sc->tempStr, "		LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum);
			else
				sc->tempLen = sprintf(sc->tempStr, "		angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			// load the stage inputs from sdata into registers when they are not already there
			if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->registerBoost == 1) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->localSize[1] > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))) {
				for (uint64_t i = 0; i < stageRadix; i++) {
					uint64_t id = j + i * logicalRegistersPerThread / stageRadix;
					id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;

					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 ";\n", sc->sdataID, sc->gl_LocalInvocationID_x, j * logicalGroupSize + i * sc->fftDim / stageRadix);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

					if (sc->resolveBankConflictFirstStages == 1) {
						sc->tempLen = sprintf(sc->tempStr, "\
	%s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}

					if (sc->localSize[1] > 1) {
						sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) return res;
					}
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s];\n", sc->regIDs[id], sc->sdataID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			// without the coalesced LUT upload the radix kernel is generated right here
			if (!sc->useCoalescedLUTUploadToSM) {
				res = appendRadixStageNonStrided_inlineKernel(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, j, k, logicalRegistersPerThread, logicalStoragePerThread);
				if (res != VKFFT_SUCCESS) return res;
			}

			if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) {
				sc->tempLen = sprintf(sc->tempStr, "		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		if (logicalGroupSize != sc->localSize[0]) {
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (sc->useCoalescedLUTUploadToSM) {
			res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;

			sc->useCoalescedLUTUploadToSM = 1;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s;\n", sc->sdataID, sc->gl_LocalInvocationID_x);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			if (sc->localSize[1] > 1) {
				sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 "*%s;\n", sc->sdataID, sc->sdataID, sc->localSize[0], sc->gl_LocalInvocationID_y);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}

			// copy numLUTelementsStage * stageSize twiddleLUT entries into sdata, one entry per thread per pass
			uint64_t numUploadPasses = (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1]));
			for (uint64_t i = 0; i < numUploadPasses; i++) {
				if (i > 0) {
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 ";\n", sc->sdataID, sc->sdataID, sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// the last pass is bounds-checked against the number of entries
				if (i == numUploadPasses - 1) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->sdataID, numLUTelementsStage * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s] = twiddleLUT[%s+%" PRIu64 "];\n", sc->sdataID, sc->sdataID, (stageSizeSum));
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (i == numUploadPasses - 1) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;
			if (logicalGroupSize != sc->localSize[0]) {
				sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalGroupSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
				if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue;
				if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) {
					uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize;
					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s  < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// emitted before the register name table is allocated so that a failing append cannot leak it
				sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_x, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				// NOTE(review): the flag was set to 1 above, so this branch looks unreachable here - confirm before removing
				if (!sc->useCoalescedLUTUploadToSM) {
					if (sc->LUT)
						sc->tempLen = sprintf(sc->tempStr, "		LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum);
					else
						sc->tempLen = sprintf(sc->tempStr, "		angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				res = appendRadixStageNonStrided_inlineKernel(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, j, k, logicalRegistersPerThread, logicalStoragePerThread);
				if (res != VKFFT_SUCCESS) return res;
				if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) {
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			if (logicalGroupSize != sc->localSize[0]) {
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			// rotate the registers through the shuffle[] array by tshuffle positions
			if ((stageSize == 1) && (sc->cacheShuffle)) {
				for (uint64_t i = 0; i < logicalRegistersPerThread; i++) {
					uint64_t id = i + k * logicalRegistersPerThread;
					id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
					sc->tempLen = sprintf(sc->tempStr, "\
		shuffle[%" PRIu64 "]=%s;\n", i, sc->regIDs[id]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				for (uint64_t i = 0; i < logicalRegistersPerThread; i++) {
					uint64_t id = i + k * logicalRegistersPerThread;
					id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
					sc->tempLen = sprintf(sc->tempStr, "\
		%s=shuffle[(%" PRIu64 "+tshuffle)%%(%" PRIu64 ")];\n", sc->regIDs[id], i, logicalRegistersPerThread);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
	}

	res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
	if (res != VKFFT_SUCCESS) return res;
	res = appendZeropadEnd(sc);
	if (res != VKFFT_SUCCESS) return res;
	return res;
}
// Helper of appendRadixStageStrided: runs one inlined radix kernel on the registers selected by (j, k).
// Builds a temporary table of stageRadix register names copied from sc->regIDs, passes it to
// inlineRadixKernelVkFFT and, on success, copies the (possibly changed) names back into sc->regIDs.
// The temporary table is released on every path, including kernel failure.
// Returns VKFFT_ERROR_MALLOC_FAILED if the table cannot be allocated, otherwise the result of inlineRadixKernelVkFFT.
static inline VkFFTResult appendRadixStageStrided_inlineKernel(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t j, uint64_t k, uint64_t logicalRegistersPerThread, uint64_t logicalStoragePerThread) {
	VkFFTResult res = VKFFT_SUCCESS;
	char** regID = (char**)malloc(sizeof(char*) * stageRadix);
	if (!regID) return VKFFT_ERROR_MALLOC_FAILED;
	for (uint64_t i = 0; i < stageRadix; i++) {
		regID[i] = (char*)malloc(sizeof(char) * 50);
		if (!regID[i]) {
			// roll back the entries allocated so far
			for (uint64_t p = 0; p < i; p++) {
				free(regID[p]);
				regID[p] = 0;
			}
			free(regID);
			regID = 0;
			return VKFFT_ERROR_MALLOC_FAILED;
		}
		uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
		id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
		sprintf(regID[i], "%s", sc->regIDs[id]);
	}
	res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageSizeSum, stageAngle, regID);
	if (res == VKFFT_SUCCESS) {
		// the kernel receives the name table by pointer; store the names it leaves there
		for (uint64_t i = 0; i < stageRadix; i++) {
			uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
			id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
			sprintf(sc->regIDs[id], "%s", regID[i]);
		}
	}
	// release the table regardless of the kernel result (previously leaked on failure)
	for (uint64_t i = 0; i < stageRadix; i++) {
		free(regID[i]);
		regID[i] = 0;
	}
	free(regID);
	regID = 0;
	return res;
}

// Generates the code of one radix stage in which threads are indexed by sc->gl_LocalInvocationID_y and
// shared memory is addressed as sdata[sharedStride*(y+offset)+x].
// Every generated line is formatted into sc->tempStr and appended with VkAppendLine.
// Parameters:
//   sc           - generator state; read for the layout (fftDim, localSize, registers_per_thread_per_radix,
//                  registerBoost, ...) and updated (useCoalescedLUTUploadToSM, regIDs).
//   floatType    - "float" or "double"; selects the literal suffix of the generated angle constant and is
//                  forwarded to inlineRadixKernelVkFFT.
//   uintType     - forwarded unchanged to inlineRadixKernelVkFFT.
//   stageSize    - modulus used when computing stageInvocationID; also compared against 1 and sc->warpSize.
//                  When it equals 1, a line resetting sc->sharedStride to localSize[0] is generated at the end.
//   stageSizeSum - offset added to the LUT index (LUTId / twiddleLUT) of this stage.
//   stageAngle   - multiplier of stageInvocationID when no LUT is used; its sign is tested in the
//                  barrier / shared-memory-read conditions.
//   stageRadix   - radix of the stage; indexes sc->registers_per_thread_per_radix.
// Returns VKFFT_SUCCESS, VKFFT_ERROR_MALLOC_FAILED, or the first error reported by a callee.
static inline VkFFTResult appendRadixStageStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix) {
	VkFFTResult res = VKFFT_SUCCESS;
	// suffix appended to the floating point angle literal in the generated code
	char LFending[4] = "";
	if (!strcmp(floatType, "float")) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#endif

	uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;
	uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];
	uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread);
	if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))
	{
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
	}
	res = appendZeropadStart(sc);
	if (res != VKFFT_SUCCESS) return res;
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;


	//upload second stage of LUT to sm
	uint64_t numLUTelementsStage = 0;
	switch (stageRadix) {
	case 2:
		numLUTelementsStage = 1;
		break;
	case 4:
		numLUTelementsStage = 2;
		break;
	case 8:
		numLUTelementsStage = 3;
		break;
	case 16:
		numLUTelementsStage = 4;
		break;
	case 32:
		numLUTelementsStage = 5;
		break;
	default:
		if (stageRadix < sc->fixMinRaderPrimeMult)
			numLUTelementsStage = stageRadix - 1;
		else
			numLUTelementsStage = stageRadix;
		break;
	}
	if ((sc->LUT) && (stageSize > 1) && ((((numLUTelementsStage >= 4) && (sc->fftDim >= 1024)) || (((numLUTelementsStage >= 3) && (sc->fftDim < 1024)))) || (logicalRegistersPerThread / stageRadix > 1)) && (sc->registerBoost == 1) && (stageSize < sc->warpSize))
		sc->useCoalescedLUTUploadToSM = 1;
	else
		sc->useCoalescedLUTUploadToSM = 0;


	for (uint64_t k = 0; k < sc->registerBoost; k++) {
		// only the first logicalGroupSize threads (along y) take part when the workgroup is larger
		if (logicalGroupSize != sc->localSize[1]) {
			sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalGroupSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
			if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue;
			// last, partially filled group: restrict it to the threads that still have data
			if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) {
				uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize;
				sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_y, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			if (sc->LUT)
				sc->tempLen = sprintf(sc->tempStr, "		LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum);
			else
				sc->tempLen = sprintf(sc->tempStr, "		angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			// load the stage inputs from sdata into registers when they are not already there
			if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->registerBoost == 1) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))) {
				for (uint64_t i = 0; i < stageRadix; i++) {
					uint64_t id = j + i * logicalRegistersPerThread / stageRadix;
					id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = sdata[%s*(%s+%" PRIu64 ")+%s];\n", sc->regIDs[id], sc->sharedStride, sc->gl_LocalInvocationID_y, j * logicalGroupSize + i * sc->fftDim / stageRadix, sc->gl_LocalInvocationID_x);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			// without the coalesced LUT upload the radix kernel is generated right here
			if (!sc->useCoalescedLUTUploadToSM) {
				res = appendRadixStageStrided_inlineKernel(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, j, k, logicalRegistersPerThread, logicalStoragePerThread);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) {
				sc->tempLen = sprintf(sc->tempStr, "		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		if (logicalGroupSize != sc->localSize[1]) {
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		//upload second stage of LUT to sm
		if (sc->useCoalescedLUTUploadToSM) {
			res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;

			sc->useCoalescedLUTUploadToSM = 1;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s;\n", sc->sdataID, sc->gl_LocalInvocationID_x);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 "*%s;\n", sc->sdataID, sc->sdataID, sc->localSize[0], sc->gl_LocalInvocationID_y);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			// copy numLUTelementsStage * stageSize twiddleLUT entries into sdata, one entry per thread per pass
			uint64_t numUploadPasses = (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1]));
			for (uint64_t i = 0; i < numUploadPasses; i++) {
				if (i > 0) {
					sc->tempLen = sprintf(sc->tempStr, "\
		%s = %s + %" PRIu64 ";\n", sc->sdataID, sc->sdataID, sc->localSize[0] * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// the last pass is bounds-checked against the number of entries
				if (i == numUploadPasses - 1) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if(%s<%" PRIu64 "){\n", sc->sdataID, numLUTelementsStage * stageSize);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s] = twiddleLUT[%s+%" PRIu64 "];\n", sc->sdataID, sc->sdataID, (stageSizeSum));
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (i == numUploadPasses - 1) {
					sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;
			if (logicalGroupSize != sc->localSize[1]) {
				sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalGroupSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
				if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue;
				if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) {
					uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize;
					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s  < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// both lines are emitted before the register name table is allocated so that a failing append cannot leak it
				sc->tempLen = sprintf(sc->tempStr, "\
		%s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_y, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if (sc->LUT)
					sc->tempLen = sprintf(sc->tempStr, "		LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum);
				else
					sc->tempLen = sprintf(sc->tempStr, "		angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = appendRadixStageStrided_inlineKernel(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, j, k, logicalRegistersPerThread, logicalStoragePerThread);
				if (res != VKFFT_SUCCESS) return res;
				if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) {
					sc->tempLen = sprintf(sc->tempStr, "		}\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			if (logicalGroupSize != sc->localSize[1]) {
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
	}
	res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
	if (res != VKFFT_SUCCESS) return res;
	res = appendZeropadEnd(sc);
	if (res != VKFFT_SUCCESS) return res;
	if (stageSize == 1) {
		sc->tempLen = sprintf(sc->tempStr, "		%s = %" PRIu64 ";\n", sc->sharedStride, sc->localSize[0]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	return res;
}
// Dispatches the generation of one radix stage to the matching stage generator.
//   - If sc->rader_generator[stageID] is set, sc->currentRaderContainer is pointed at the Rader container
//     whose prime equals stageRadix, and its `type` field selects appendMultRaderStage (non-zero) or
//     appendFFTRaderStage (zero).
//   - Otherwise appendRadixStageNonStrided or appendRadixStageStrided is used.
// shuffleType selects the non-strided (0, 5, 6, 110, 120, 130, 140, 142, 144) or the strided
// (1, 2, 111, 121, 131, 141, 143, 145) variant; for any other value nothing is generated.
// Returns the result of the selected generator, or VKFFT_SUCCESS when no generator was called.
static inline VkFFTResult appendRadixStage(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageID, uint64_t shuffleType) {
	// classify the shuffle type once instead of repeating the case lists in every branch
	int shuffleKnown = 1;
	uint64_t strided = 0;
	switch (shuffleType) {
	case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
		strided = 0;
		break;
	case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145:
		strided = 1;
		break;
	default:
		shuffleKnown = 0;
		break;
	}

	if (!sc->rader_generator[stageID]) {
		if (!shuffleKnown) return VKFFT_SUCCESS;
		if (strided)
			return appendRadixStageStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix);
		return appendRadixStageNonStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix);
	}

	// the whole list is scanned, so the last container with a matching prime is the one selected
	for (uint64_t idx = 0; idx < sc->numRaderPrimes; idx++) {
		if (sc->raderContainer[idx].prime == stageRadix)
			sc->currentRaderContainer = &sc->raderContainer[idx];
	}

	if (sc->currentRaderContainer->type) {
		if (!shuffleKnown) return VKFFT_SUCCESS;
		return appendMultRaderStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageID, strided);
	}
	if (!shuffleKnown) return VKFFT_SUCCESS;
	return appendFFTRaderStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageID, strided);
}

// Register-boost shuffle hook. Currently a no-op: it appends nothing to the
// generated code and always reports VKFFT_SUCCESS.
//
// The pass that used to live here divided every logical register (via
// VkDivComplexNumber) by stageRadixPrev * stageRadix, with an extra factor of
// 2 or 4 for inverse DCT at stageSize == 1 and an optional Bluestein size
// factor. That pass is disabled; appendRadixShuffleNonStrided in this file
// multiplies registers by a normalization constant on its own.
//
// All parameters are accepted only to keep the signature stable for callers.
static inline VkFFTResult appendRegisterBoostShuffle(VkFFTSpecializationConstantsLayout* sc, const char* floatType, uint64_t stageSize, uint64_t stageRadixPrev, uint64_t stageRadix, long double stageAngle) {
	(void)sc;
	(void)floatType;
	(void)stageSize;
	(void)stageRadixPrev;
	(void)stageRadix;
	(void)stageAngle;
	return VKFFT_SUCCESS;
}

/*
 * Appends to the generated kernel the code that follows one radix stage in the
 * non-strided variant of the shuffle.
 *
 * Depending on the configuration in sc, one of three things is generated:
 *   1. registers are written out with VkSharedStore at re-indexed positions
 *      (and, when sc->registerBoost > 1, read back with VkSharedLoad), and the
 *      register name table sc->regIDs is permuted accordingly;
 *   2. only sc->regIDs is permuted, no code is appended;
 *   3. registers stay where they are and are only multiplied by the
 *      normalization constant (when one applies).
 * A barrier is appended first when the first condition below holds.
 *
 * Parameters:
 *   sc             - generator state; code is appended through it and
 *                    sc->regIDs / sc->resolveBankConflictFirstStages may be
 *                    modified.
 *   floatType      - "float" or "double"; selects the literal suffix of the
 *                    normalization constant.
 *   uintType       - not used by this function.
 *   stageSize      - size of the stage that has just been computed.
 *   stageSizeSum   - not used by this function.
 *   stageAngle     - only its sign is inspected here.
 *   stageRadix     - radix of the stage that has just been computed.
 *   stageRadixNext - radix of the following stage (used for the read-back when
 *                    sc->registerBoost > 1).
 *
 * Returns VKFFT_SUCCESS, VKFFT_ERROR_MALLOC_FAILED if the temporary register
 * name table cannot be allocated, or the first error reported by an append
 * helper. The temporary table is released on every exit path.
 */
static inline VkFFTResult appendRadixShuffleNonStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Suffix appended to the floating point normalization literal; it stays
	// empty for "double" on backends 3 and 4.
	char LFending[4] = "";
	if (!strcmp(floatType, "float")) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#endif
	// stageNormalization stays "" unless a factor different from 1 applies;
	// an empty string means "do not scale" everywhere below.
	char stageNormalization[50] = "";
	uint64_t normalizationValue = 1;
	if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) {
		if ((sc->performDCT) && (sc->actualInverse)) {
			if (sc->performDCT == 1)
				normalizationValue = (sc->sourceFFTSize - 1) * 2;
			else
				normalizationValue = sc->sourceFFTSize * 2;
		}
		else
			normalizationValue = sc->sourceFFTSize;
	}
	if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) {
		normalizationValue *= sc->fft_dim_full;
	}
	if (normalizationValue != 1) {
		sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending);
	}
	char tempNum[50] = "";

	// Per-radix register counts for the current and the next stage; the
	// "storage" variants additionally account for registerBoost.
	uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;
	uint64_t logicalStoragePerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext] * sc->registerBoost;
	uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];
	uint64_t logicalRegistersPerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext];

	uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread);
	uint64_t logicalGroupSizeNext = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThreadNext);
	if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->registerBoost == 1) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)))) || (sc->performDCT)))
	{
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
	}
	//if ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->registerBoost > 1) || (sc->performDCT)) {
	if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->registerBoost > 1) || (sc->performDCT))) {
		if (!((sc->registerBoost > 1) && (stageSize * stageRadix == sc->fftDim / sc->stageRadix[sc->numStages - 1]) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) {
			// Case 1: store registers, optionally load them back, and permute sc->regIDs.
			// tempID collects the new register name order; it is committed to
			// sc->regIDs only if every append below succeeds.
			char** tempID;
			tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
			if (tempID) {
				for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
					tempID[i] = (char*)malloc(sizeof(char) * 50);
					if (!tempID[i]) {
						for (uint64_t j = 0; j < i; j++) {
							free(tempID[j]);
							tempID[j] = 0;
						}
						free(tempID);
						tempID = 0;
						return VKFFT_ERROR_MALLOC_FAILED;
					}
				}
				// From here on tempID is fully allocated: every failure must leave
				// through freeTempID so the table is released before returning.
				res = appendZeropadStart(sc);
				if (res != VKFFT_SUCCESS) goto freeTempID;
				res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
				if (res != VKFFT_SUCCESS) goto freeTempID;

				for (uint64_t k = 0; k < sc->registerBoost; ++k) {
					uint64_t t = 0;
					if (k > 0) {
						res = appendBarrierVkFFT(sc, 2);
						if (res != VKFFT_SUCCESS) goto freeTempID;
						res = appendZeropadStart(sc);
						if (res != VKFFT_SUCCESS) goto freeTempID;
						res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
						if (res != VKFFT_SUCCESS) goto freeTempID;
						if (logicalGroupSize * logicalStoragePerThread > sc->fftDim) {
							sc->tempLen = sprintf(sc->tempStr, "\
	if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) goto freeTempID;
						}
					}
					if (logicalGroupSize != sc->localSize[0]) {
						sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalGroupSize);
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) goto freeTempID;
					}
					for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
						if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) {
							if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) {
								// Last, partially filled group: guard it with a thread-index cut.
								uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize;
								sc->tempLen = sprintf(sc->tempStr, "\
		if (%s  < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut);
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) goto freeTempID;
							}
							sprintf(tempNum, "%" PRIu64 "", j * logicalGroupSize);
							res = VkAddReal(sc, sc->stageInvocationID, sc->gl_LocalInvocationID_x, tempNum);
							if (res != VKFFT_SUCCESS) goto freeTempID;
							res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID);
							if (res != VKFFT_SUCCESS) goto freeTempID;
							sprintf(tempNum, "%" PRIu64 "", stageSize);
							res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum);
							if (res != VKFFT_SUCCESS) goto freeTempID;
							res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID);
							if (res != VKFFT_SUCCESS) goto freeTempID;
							sprintf(tempNum, "%" PRIu64 "", stageRadix);
							res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum);
							if (res != VKFFT_SUCCESS) goto freeTempID;
							res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID);
							if (res != VKFFT_SUCCESS) goto freeTempID;
						}
						/*sc->tempLen = sprintf(sc->tempStr, "\
		stageInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") %% (%" PRIu64 ");\n\
		blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\
		inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/
						if ((stageSize == 1) && (sc->cacheShuffle)) {
							for (uint64_t i = 0; i < stageRadix; i++) {
								uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
								id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
								sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
								t++;
								if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) {
									sprintf(tempNum, "%" PRIu64 "", i);
									res = VkAddReal(sc, sc->sdataID, tempNum, sc->tshuffle);
									if (res != VKFFT_SUCCESS) goto freeTempID;
									sprintf(tempNum, "%" PRIu64 "", logicalRegistersPerThread);
									res = VkModReal(sc, sc->sdataID, sc->sdataID, tempNum);
									if (res != VKFFT_SUCCESS) goto freeTempID;
									sprintf(tempNum, "%" PRIu64 "", stageSize);
									res = VkMulReal(sc, sc->sdataID, sc->sdataID, tempNum);
									if (res != VKFFT_SUCCESS) goto freeTempID;
									if (sc->localSize[1] > 1) {
										res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride);
										if (res != VKFFT_SUCCESS) goto freeTempID;
										res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID);
										if (res != VKFFT_SUCCESS) goto freeTempID;
									}
									res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->inoutID);
									if (res != VKFFT_SUCCESS) goto freeTempID;

									//sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + ((%" PRIu64 "+tshuffle) %% (%" PRIu64 "))*%" PRIu64 "", i, logicalRegistersPerThread, stageSize);
									if (strcmp(stageNormalization, "")) {
										res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization);
										if (res != VKFFT_SUCCESS) goto freeTempID;
									}
									res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]);
									if (res != VKFFT_SUCCESS) goto freeTempID;
								}
								/*sc->tempLen = sprintf(sc->tempStr, "\
	sdata[sharedStride * gl_LocalInvocationID.y + inoutID + ((%" PRIu64 "+tshuffle) %% (%" PRIu64 "))*%" PRIu64 "] = temp%s%s;\n", i, logicalRegistersPerThread, stageSize, sc->regIDs[id], stageNormalization);*/
							}
						}
						else {
							for (uint64_t i = 0; i < stageRadix; i++) {
								uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
								id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
								sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
								t++;
								if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) {
									sprintf(tempNum, "%" PRIu64 "", i * stageSize);
									res = VkAddReal(sc, sc->sdataID, sc->inoutID, tempNum);
									if (res != VKFFT_SUCCESS) goto freeTempID;
									if ((stageSize <= sc->numSharedBanks / 2) && (sc->fftDim > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != sc->fftDim / sc->registerBoost) && ((sc->fftDim & (sc->fftDim - 1)) == 0) && (stageSize * stageRadix != sc->fftDim)) {
										// Switch the generated sharedStride once, then remap the index
										// so that every numSharedBanks/2 entries are followed by a gap.
										if (sc->resolveBankConflictFirstStages == 0) {
											sc->resolveBankConflictFirstStages = 1;
											sc->tempLen = sprintf(sc->tempStr, "\
	%s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideBankConflictFirstStages);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) goto freeTempID;
										}
										sc->tempLen = sprintf(sc->tempStr, "\
	%s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
										res = VkAppendLine(sc);
										if (res != VKFFT_SUCCESS) goto freeTempID;

									}
									else {
										if (sc->resolveBankConflictFirstStages == 1) {
											sc->resolveBankConflictFirstStages = 0;
											sc->tempLen = sprintf(sc->tempStr, "\
	%s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideReadWriteConflict);
											res = VkAppendLine(sc);
											if (res != VKFFT_SUCCESS) goto freeTempID;
										}
									}
									if (sc->localSize[1] > 1) {
										res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride);
										if (res != VKFFT_SUCCESS) goto freeTempID;
										res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID);
										if (res != VKFFT_SUCCESS) goto freeTempID;
									}
									//sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize);
									if (strcmp(stageNormalization, "")) {
										res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization);
										if (res != VKFFT_SUCCESS) goto freeTempID;
									}
									res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]);
									if (res != VKFFT_SUCCESS) goto freeTempID;
								}
								/*sc->tempLen = sprintf(sc->tempStr, "\
	sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/
							}
						}
						if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) {
							if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) {
								// Closes the current_group_cut guard opened above.
								sc->tempLen = sprintf(sc->tempStr, "	}\n");
								res = VkAppendLine(sc);
								if (res != VKFFT_SUCCESS) goto freeTempID;
							}
						}
					}
					if (logicalGroupSize != sc->localSize[0]) {
						sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
						res = VkAppendLine(sc);
						if (res != VKFFT_SUCCESS) goto freeTempID;
					}
					// Registers beyond the logical count keep their current names.
					for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) {
						sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]);
						t++;
					}
					t = 0;
					if (sc->registerBoost > 1) {
						if (logicalGroupSize * logicalStoragePerThread > sc->fftDim)
						{
							sc->tempLen = sprintf(sc->tempStr, "	}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) goto freeTempID;
						}
						res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
						if (res != VKFFT_SUCCESS) goto freeTempID;
						res = appendZeropadEnd(sc);
						if (res != VKFFT_SUCCESS) goto freeTempID;
						res = appendBarrierVkFFT(sc, 2);
						if (res != VKFFT_SUCCESS) goto freeTempID;
						res = appendZeropadStart(sc);
						if (res != VKFFT_SUCCESS) goto freeTempID;
						res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
						if (res != VKFFT_SUCCESS) goto freeTempID;
						if (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim) {
							sc->tempLen = sprintf(sc->tempStr, "\
	if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThreadNext, sc->fftDim);
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) goto freeTempID;
						}
						// Read the values back in the order expected by the next stage.
						for (uint64_t j = 0; j < logicalRegistersPerThreadNext / stageRadixNext; j++) {
							for (uint64_t i = 0; i < stageRadixNext; i++) {
								uint64_t id = j + k * logicalRegistersPerThreadNext / stageRadixNext + i * logicalStoragePerThreadNext / stageRadixNext;
								id = (id / logicalRegistersPerThreadNext) * sc->registers_per_thread + id % logicalRegistersPerThreadNext;
								//resID[t + k * sc->registers_per_thread] = sc->regIDs[id];
								sprintf(tempNum, "%" PRIu64 "", t * logicalGroupSizeNext);
								res = VkAddReal(sc, sc->sdataID, sc->gl_LocalInvocationID_x, tempNum);
								if (res != VKFFT_SUCCESS) goto freeTempID;
								if (sc->localSize[1] > 1) {
									res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride);
									if (res != VKFFT_SUCCESS) goto freeTempID;
									res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID);
									if (res != VKFFT_SUCCESS) goto freeTempID;
								}
								if (sc->resolveBankConflictFirstStages == 1) {
									sc->tempLen = sprintf(sc->tempStr, "\
	%s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
									res = VkAppendLine(sc);
									if (res != VKFFT_SUCCESS) goto freeTempID;
								}
								//sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "", t * logicalGroupSizeNext);
								res = VkSharedLoad(sc, tempID[t + k * sc->registers_per_thread], sc->sdataID);
								if (res != VKFFT_SUCCESS) goto freeTempID;
								/*sc->tempLen = sprintf(sc->tempStr, "\
		temp%s = sdata[sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "];\n", tempID[t + k * sc->registers_per_thread], t * logicalGroupSizeNext);*/
								t++;
							}

						}
						if (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim)
						{
							sc->tempLen = sprintf(sc->tempStr, "	}\n");
							res = VkAppendLine(sc);
							if (res != VKFFT_SUCCESS) goto freeTempID;
						}
						res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
						if (res != VKFFT_SUCCESS) goto freeTempID;
						res = appendZeropadEnd(sc);
						if (res != VKFFT_SUCCESS) goto freeTempID;
					}
					else {
						res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
						if (res != VKFFT_SUCCESS) goto freeTempID;
						res = appendZeropadEnd(sc);
						if (res != VKFFT_SUCCESS) goto freeTempID;
					}
				}
				// Everything was appended successfully: commit the new register order.
				for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
					sprintf(sc->regIDs[i], "%s", tempID[i]);
				}
			freeTempID:
				// Reached both on success and on failure; on failure res carries the
				// error code and is returned by the final return statement below.
				for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
					free(tempID[i]);
					tempID[i] = 0;
				}
				free(tempID);
				tempID = 0;
			}
			else
				return VKFFT_ERROR_MALLOC_FAILED;
		}
		else {
			// Case 2: registerBoost > 1, the last stage radix equals registerBoost and
			// stageSize * stageRadix == fftDim / (last stage radix): no code is
			// appended, only the register name table is permuted.
			char** tempID;
			tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
			if (tempID) {
				//resID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost);
				for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
					tempID[i] = (char*)malloc(sizeof(char) * 50);
					if (!tempID[i]) {
						for (uint64_t j = 0; j < i; j++) {
							free(tempID[j]);
							tempID[j] = 0;
						}
						free(tempID);
						tempID = 0;
						return VKFFT_ERROR_MALLOC_FAILED;
					}
				}
				for (uint64_t k = 0; k < sc->registerBoost; ++k) {
					for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
						for (uint64_t i = 0; i < stageRadix; i++) {
							uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
							id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
							sprintf(tempID[j + i * logicalRegistersPerThread / stageRadix + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
						}
					}
					for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) {
						sprintf(tempID[j + k * sc->registers_per_thread], "%s", sc->regIDs[j + k * sc->registers_per_thread]);
					}
				}
				for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
					sprintf(sc->regIDs[i], "%s", tempID[i]);
				}
				for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) {
					free(tempID[i]);
					tempID[i] = 0;
				}
				free(tempID);
				tempID = 0;
			}
			else
				return VKFFT_ERROR_MALLOC_FAILED;
		}
	}
	else {
		// Case 3: values stay in their registers; only the normalization
		// multiplication (if any) is appended.
		res = appendZeropadStart(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
		if (res != VKFFT_SUCCESS) return res;

		if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) {
			for (uint64_t i = 0; i < logicalStoragePerThread; i++) {
				if (strcmp(stageNormalization, "")) {
					res = VkMulComplexNumber(sc, sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], stageNormalization);
					if (res != VKFFT_SUCCESS) return res;
				}
				/*sc->tempLen = sprintf(sc->tempStr, "\
	temp%s = temp%s%s;\n", sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], stageNormalization);*/
			}
		}

		res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadEnd(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	return res;
}
// Frees the first `count` entries of a scratch register-name table created by
// appendRadixShuffleStrided_allocTempIDs, then the table itself. Accepts a null table.
static inline void appendRadixShuffleStrided_freeTempIDs(char** tempID, uint64_t count) {
	if (!tempID) return;
	for (uint64_t i = 0; i < count; i++) {
		free(tempID[i]);
		tempID[i] = 0;
	}
	free(tempID);
}

// Allocates a table of `count` scratch buffers of 50 chars each (the size the original code used
// for a register name). Returns 0 if any allocation fails; nothing is leaked in that case.
static inline char** appendRadixShuffleStrided_allocTempIDs(uint64_t count) {
	char** tempID = (char**)malloc(sizeof(char*) * count);
	if (!tempID) return 0;
	for (uint64_t i = 0; i < count; i++) {
		tempID[i] = (char*)malloc(sizeof(char) * 50);
		if (!tempID[i]) {
			appendRadixShuffleStrided_freeTempIDs(tempID, i);
			return 0;
		}
	}
	return tempID;
}

// Emits the store (VkSharedStore) of the registers of the current stage and, when
// sc->registerBoost > 1, the read-back (VkSharedLoad) in the order used by the next stage.
// The register names, in their new order, are written to tempID; sc->regIDs itself is not modified.
// tempID must hold sc->registers_per_thread * sc->registerBoost buffers and stays owned by the caller,
// so returning early on an emitter error here cannot leak it.
static inline VkFFTResult appendRadixShuffleStrided_exchange(VkFFTSpecializationConstantsLayout* sc, char** tempID, char* stageNormalization, uint64_t stageSize, uint64_t stageRadix, uint64_t stageRadixNext) {
	VkFFTResult res = VKFFT_SUCCESS;
	char tempNum[50] = "";

	uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;
	uint64_t logicalStoragePerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext] * sc->registerBoost;
	uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];
	uint64_t logicalRegistersPerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext];
	uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread);
	uint64_t logicalGroupSizeNext = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThreadNext);

	res = appendZeropadStart(sc);
	if (res != VKFFT_SUCCESS) return res;
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;

	for (uint64_t k = 0; k < sc->registerBoost; ++k) {
		uint64_t t = 0;
		if (k > 0) {
			// the first pass reuses the zeropad/disable-threads prologue emitted above; later passes reopen it after a barrier
			res = appendBarrierVkFFT(sc, 2);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;
			// NOTE(review): no matching "}" for this guard is emitted later in the pass - confirm whether
			// this condition can hold when registerBoost > 1 (behaviour kept as in the original).
			if (logicalGroupSize * logicalStoragePerThread > sc->fftDim) {
				sc->tempLen = sprintf(sc->tempStr, "\tif (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		if (logicalGroupSize != sc->localSize[1]) {
			sc->tempLen = sprintf(sc->tempStr, "\t\tif (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalGroupSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
			uint64_t slot = j + k * logicalRegistersPerThread / stageRadix;
			// stores are emitted only while the slot starts inside the sequence
			int slotInRange = (logicalGroupSize * (slot * stageRadix) <= sc->fftDim);
			// the last in-range slot may be only partially filled and gets its own thread guard
			int slotIsCut = slotInRange && (logicalGroupSize * ((1 + slot) * stageRadix) > sc->fftDim);
			if (slotInRange) {
				if (slotIsCut) {
					uint64_t current_group_cut = sc->fftDim / stageRadix - slot * logicalGroupSize;
					sc->tempLen = sprintf(sc->tempStr, "\t\tif (%s  < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				// stageInvocationID = (gl_LocalInvocationID.y + j * logicalGroupSize) % stageSize
				// blockInvocationID = (gl_LocalInvocationID.y + j * logicalGroupSize) - stageInvocationID
				// inoutID = blockInvocationID * stageRadix + stageInvocationID
				sprintf(tempNum, "%" PRIu64 "", j * logicalGroupSize);
				res = VkAddReal(sc, sc->stageInvocationID, sc->gl_LocalInvocationID_y, tempNum);
				if (res != VKFFT_SUCCESS) return res;
				res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(tempNum, "%" PRIu64 "", stageSize);
				res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum);
				if (res != VKFFT_SUCCESS) return res;
				res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(tempNum, "%" PRIu64 "", stageRadix);
				res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum);
				if (res != VKFFT_SUCCESS) return res;
				res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t i = 0; i < stageRadix; i++) {
				uint64_t id = slot + i * logicalStoragePerThread / stageRadix;
				id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
				// the new register order is recorded even for slots that are not stored
				sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
				t++;
				if (slotInRange) {
					// sdataID = sharedStride * (inoutID + i * stageSize) + gl_LocalInvocationID.x
					sprintf(tempNum, "%" PRIu64 "", i * stageSize);
					res = VkAddReal(sc, sc->sdataID, sc->inoutID, tempNum);
					if (res != VKFFT_SUCCESS) return res;
					res = VkMulReal(sc, sc->sdataID, sc->sharedStride, sc->sdataID);
					if (res != VKFFT_SUCCESS) return res;
					res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
					if (res != VKFFT_SUCCESS) return res;
					if (strcmp(stageNormalization, "")) {
						res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization);
						if (res != VKFFT_SUCCESS) return res;
					}
					res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			if (slotIsCut) {
				sc->tempLen = sprintf(sc->tempStr, "\t}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		if (logicalGroupSize != sc->localSize[1]) {
			sc->tempLen = sprintf(sc->tempStr, "\t\t}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		// registers beyond the logical count of this radix keep their names
		for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) {
			sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]);
			t++;
		}
		t = 0;
		if (sc->registerBoost > 1) {
			// with register boost the values are read back right away, in the order of the next stage
			res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendBarrierVkFFT(sc, 2);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;
			int guardNext = (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim);
			if (guardNext) {
				sc->tempLen = sprintf(sc->tempStr, "\tif (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThreadNext, sc->fftDim);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t j = 0; j < logicalRegistersPerThreadNext / stageRadixNext; j++) {
				for (uint64_t i = 0; i < stageRadixNext; i++) {
					// sdataID = sharedStride * (gl_LocalInvocationID.y + t * logicalGroupSizeNext) + gl_LocalInvocationID.x
					sprintf(tempNum, "%" PRIu64 "", t * logicalGroupSizeNext);
					res = VkAddReal(sc, sc->sdataID, sc->gl_LocalInvocationID_y, tempNum);
					if (res != VKFFT_SUCCESS) return res;
					res = VkMulReal(sc, sc->sdataID, sc->sharedStride, sc->sdataID);
					if (res != VKFFT_SUCCESS) return res;
					res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x);
					if (res != VKFFT_SUCCESS) return res;
					res = VkSharedLoad(sc, tempID[t + k * sc->registers_per_thread], sc->sdataID);
					if (res != VKFFT_SUCCESS) return res;
					t++;
				}
			}
			if (guardNext) {
				sc->tempLen = sprintf(sc->tempStr, "\t}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadEnd(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	return res;
}

// Emits the code that follows a radix stage of the strided kernel layout (the variant that indexes
// threads of one sequence by gl_LocalInvocationID_y and multiplies positions by sc->sharedStride).
//
// Depending on sc and the stage, one of three things is generated:
//  - the registers are stored through VkSharedStore (and read back when registerBoost > 1), and
//    sc->regIDs is rewritten to the resulting register order;
//  - only sc->regIDs is permuted (register boost, stage directly before a last stage whose radix equals registerBoost);
//  - nothing is exchanged and the registers are only multiplied by the normalization factor, if there is one.
//
// floatType      "float" or "double"; selects the literal suffix of the normalization constant.
// uintType       not used by this function.
// stageSize      size of the current stage; used in the index arithmetic and to detect the first/last stage.
// stageSizeSum   not used by this function.
// stageAngle     only its sign is inspected here.
// stageRadix     radix of the current stage.
// stageRadixNext radix of the following stage (only used for the read-back with register boost).
//
// Returns VKFFT_SUCCESS, VKFFT_ERROR_MALLOC_FAILED if the scratch name table cannot be allocated,
// or the first error reported by one of the emitters. The scratch table is released on every path.
static inline VkFFTResult appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext) {
	VkFFTResult res = VKFFT_SUCCESS;
	char LFending[4] = "";
	if (!strcmp(floatType, "float")) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2))
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
#endif

	uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;
	uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];

	// Normalization constant as text; stays empty when no scaling applies to this stage.
	char stageNormalization[50] = "";
	uint64_t normalizationValue = 1;
	if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) {
		if ((sc->performDCT) && (sc->actualInverse)) {
			if (sc->performDCT == 1)
				normalizationValue = (sc->sourceFFTSize - 1) * 2;
			else
				normalizationValue = sc->sourceFFTSize * 2;
		}
		else
			normalizationValue = sc->sourceFFTSize;
	}
	if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) {
		normalizationValue *= sc->fft_dim_full;
	}
	if (normalizationValue != 1) {
		sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending);
	}

	// The same predicate decides both the leading barrier and the exchange itself.
	int exchangeRegisters = (!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->performDCT));

	if (exchangeRegisters) {
		res = appendBarrierVkFFT(sc, 2);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (stageSize == sc->fftDim / stageRadix) {
		sc->tempLen = sprintf(sc->tempStr, "\t\t%s = %" PRIu64 ";\n", sc->sharedStride, sc->sharedStrideReadWriteConflict);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (exchangeRegisters) {
		uint64_t numRegIDs = sc->registers_per_thread * sc->registerBoost;
		char** tempID = appendRadixShuffleStrided_allocTempIDs(numRegIDs);
		if (!tempID) return VKFFT_ERROR_MALLOC_FAILED;

		if (!((sc->registerBoost > 1) && (stageSize * stageRadix == sc->fftDim / sc->stageRadix[sc->numStages - 1]) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) {
			res = appendRadixShuffleStrided_exchange(sc, tempID, stageNormalization, stageSize, stageRadix, stageRadixNext);
		}
		else {
			// no code is emitted in this case, only the register names are reordered
			for (uint64_t k = 0; k < sc->registerBoost; ++k) {
				for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
					for (uint64_t i = 0; i < stageRadix; i++) {
						uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
						id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
						sprintf(tempID[j + i * logicalRegistersPerThread / stageRadix + k * sc->registers_per_thread], "%s", sc->regIDs[id]);
					}
				}
				for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) {
					sprintf(tempID[j + k * sc->registers_per_thread], "%s", sc->regIDs[j + k * sc->registers_per_thread]);
				}
			}
		}
		if (res == VKFFT_SUCCESS) {
			// commit the new register order only when everything was emitted
			for (uint64_t i = 0; i < numRegIDs; i++) {
				sprintf(sc->regIDs[i], "%s", tempID[i]);
			}
		}
		appendRadixShuffleStrided_freeTempIDs(tempID, numRegIDs);
		tempID = 0;
		if (res != VKFFT_SUCCESS) return res;
	}
	else {
		res = appendZeropadStart(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
		if (res != VKFFT_SUCCESS) return res;
		// One flag drives both the opening and the closing brace so the emitted block is always balanced
		// (the original closed it under a different condition, which differs when registerBoost > 1).
		int guardThreads = (sc->localSize[1] * logicalStoragePerThread > sc->fftDim);
		if (guardThreads) {
			sc->tempLen = sprintf(sc->tempStr, "\tif (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		if ((((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) && strcmp(stageNormalization, "")) {
			// for i < logicalRegistersPerThread the register index used by the exchange path reduces to i
			for (uint64_t i = 0; i < logicalRegistersPerThread; i++) {
				res = VkMulComplexNumber(sc, sc->regIDs[i], sc->regIDs[i], stageNormalization);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		if (guardThreads) {
			sc->tempLen = sprintf(sc->tempStr, "\t}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadEnd(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	return res;
}
// Dispatches the post-stage register shuffle to the non-strided or the strided generator,
// selected by shuffleType. Nothing is emitted when sc->rader_generator[stageID] is non-zero
// or when shuffleType is not one of the listed values; VKFFT_SUCCESS is returned in both cases.
// Otherwise the result of the selected generator is returned unchanged.
static inline VkFFTResult appendRadixShuffle(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext, uint64_t stageID, uint64_t shuffleType) {
	if (sc->rader_generator[stageID] != 0)
		return VKFFT_SUCCESS;

	switch (shuffleType) {
	case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
		return appendRadixShuffleNonStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageRadixNext);
	case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145:
		return appendRadixShuffleStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageRadixNext);
	default:
		return VKFFT_SUCCESS;
	}
}

// Emits, for every register-boost pass, a store of the pass's registers to sdata followed by a
// load back from sdata, with a barrier between the two. Code is generated only when
// sc->registerBoost > 1 and the per-thread storage of the boundary radix differs from
// sc->min_registers_per_thread * sc->registerBoost.
//
// shuffleType selects the addressing: the first case list indexes sdata by gl_LocalInvocationID_x only,
//             the second one by gl_LocalInvocationID_x + sharedStride * (gl_LocalInvocationID_y + offset).
//             Any other value emits nothing.
// start       1: the radix of the first stage is used and the load half is the guarded one;
//             0: the radix of the last stage is used and the store half is the guarded one.
// floatType and uintType are not used by this function.
//
// Returns VKFFT_SUCCESS or the first error reported by an emitter.
static inline VkFFTResult appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t shuffleType, uint64_t start) {
	VkFFTResult res = VKFFT_SUCCESS;
	uint64_t strided = 0;
	switch (shuffleType) {
	case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
		strided = 0;
		break;
	case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145:
		strided = 1;
		break;
	default:
		return res;
	}

	uint64_t storagePerThread = ((start == 1) ? sc->registers_per_thread_per_radix[sc->stageRadix[0]] : sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]]) * sc->registerBoost;
	uint64_t groupSize = sc->fftDim / storagePerThread;
	if (!((sc->registerBoost > 1) && (storagePerThread != sc->min_registers_per_thread * sc->registerBoost)))
		return res;

	// thread index and workgroup extent along the transformed sequence for the chosen layout
	const char* sequenceThreadID = strided ? sc->gl_LocalInvocationID_y : sc->gl_LocalInvocationID_x;
	uint64_t sequenceThreads = sc->localSize[strided];

	for (uint64_t pass = 0; pass < sc->registerBoost; pass++) {
		// half 0 writes the registers to sdata, half 1 reads them back
		for (uint64_t half = 0; half < 2; half++) {
			if ((pass > 0) || (half > 0)) {
				res = appendBarrierVkFFT(sc, 2);
				if (res != VKFFT_SUCCESS) return res;
			}
			res = appendZeropadStart(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
			if (res != VKFFT_SUCCESS) return res;

			// The half whose index equals `start` uses the boundary-radix register count with a thread guard;
			// the other half uses min_registers_per_thread registers spaced by the workgroup extent.
			int guarded = (start == half);
			uint64_t numRegisters = guarded ? (storagePerThread / sc->registerBoost) : sc->min_registers_per_thread;
			uint64_t spacing = guarded ? groupSize : sequenceThreads;

			if (guarded) {
				sc->tempLen = sprintf(sc->tempStr, "\tif (%s * %" PRIu64 " < %" PRIu64 ") {\n", sequenceThreadID, storagePerThread, sc->fftDim);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t r = 0; r < numRegisters; r++) {
				char* regName = sc->regIDs[r + pass * sc->registers_per_thread];
				uint64_t offset = r * spacing;
				if (half == 0) {
					if (strided)
						sc->tempLen = sprintf(sc->tempStr, "\tsdata[%s + %s * (%s + %" PRIu64 ")] = %s;\n", sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, offset, regName);
					else
						sc->tempLen = sprintf(sc->tempStr, "\tsdata[%s + %" PRIu64 "] = %s;\n", sc->gl_LocalInvocationID_x, offset, regName);
				}
				else {
					if (strided)
						sc->tempLen = sprintf(sc->tempStr, "\t%s = sdata[%s + %s * (%s + %" PRIu64 ")];\n", regName, sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, offset);
					else
						sc->tempLen = sprintf(sc->tempStr, "\t%s = sdata[%s + %" PRIu64 "];\n", regName, sc->gl_LocalInvocationID_x, offset);
				}
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (guarded) {
				sc->tempLen = sprintf(sc->tempStr, "\t}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}

			res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
			if (res != VKFFT_SUCCESS) return res;
			res = appendZeropadEnd(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
	}
	return res;
}

// Emits code that copies values from sdata into the registers named by sc->regIDs.
// Code is generated only when sc->writeFromRegisters is zero, or during a convolution step with
// more than one matrix coordinate or more than one kernel.
//
// When sc->matrixConvolution == 1 a single set of loads is emitted. Otherwise the loads are wrapped in
// "switch (coordinate)": case 0 targets the plain register names and case c > 0 targets "<name>_<c>".
//
// readType selects the addressing: the first case list (single_c2c) walks sdata along gl_LocalInvocationID_x
//          with the literal name sharedStride, the second one (grouped_c2c) walks along gl_LocalInvocationID_y
//          with sc->sharedStride. Any other value emits nothing.
//
// Returns VKFFT_SUCCESS or the first error reported by an emitter.
static inline VkFFTResult appendCoordinateRegisterStore(VkFFTSpecializationConstantsLayout* sc, uint64_t readType) {
	VkFFTResult res = VKFFT_SUCCESS;
	if (!((!sc->writeFromRegisters) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))
		return res;

	uint64_t grouped = 0;
	switch (readType) {
	case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c
		grouped = 0;
		break;
	case 1: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c
		grouped = 1;
		break;
	default:
		return res;
	}

	// workgroup extent, thread index and workgroup-size name along the transformed sequence
	uint64_t sequenceThreads = sc->localSize[grouped];
	const char* sequenceThreadID = grouped ? sc->gl_LocalInvocationID_y : sc->gl_LocalInvocationID_x;
	const char* sequenceGroupSize = grouped ? sc->gl_WorkGroupSize_y : sc->gl_WorkGroupSize_x;
	uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sequenceThreads);

	res = appendBarrierVkFFT(sc, 1);
	if (res != VKFFT_SUCCESS) return res;
	res = appendZeropadStart(sc);
	if (res != VKFFT_SUCCESS) return res;
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;

	int perCoordinate = (sc->matrixConvolution != 1);
	uint64_t coordinate = 0;
	do {
		// empty for coordinate 0, "_<coordinate>" afterwards; appended to every register name
		char coordinateSuffix[32] = "";
		if (perCoordinate) {
			if (coordinate == 0) {
				sc->tempLen = sprintf(sc->tempStr, "\tswitch (coordinate) {\n\tcase 0:\n");
			}
			else {
				sprintf(coordinateSuffix, "_%" PRIu64 "", coordinate);
				sc->tempLen = sprintf(sc->tempStr, "\tcase %" PRIu64 ":\n", coordinate);
			}
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		// the first register is always loaded, without offset and without guard
		if (grouped)
			sc->tempLen = sprintf(sc->tempStr, "\t\t%s%s = sdata[%s*(%s)+%s];\n", sc->regIDs[0], coordinateSuffix, sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
		else
			sc->tempLen = sprintf(sc->tempStr, "\t\t%s%s = sdata[sharedStride * %s + %s];\n", sc->regIDs[0], coordinateSuffix, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;

		for (uint64_t r = 1; r < used_registers_read; r++) {
			// a register row that extends past fftDim is restricted to the threads that still map into it
			int partialRow = (sequenceThreads * (r + 1) > sc->fftDim);
			if (partialRow) {
				sc->tempLen = sprintf(sc->tempStr, "\t\tif (%s < %" PRIu64 ") {\n", sequenceThreadID, sc->fftDim - sequenceThreads * r);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			if (grouped)
				sc->tempLen = sprintf(sc->tempStr, "\t\t%s%s = sdata[%s*(%s+%" PRIu64 "*%s)+%s];\n", sc->regIDs[r], coordinateSuffix, sc->sharedStride, sc->gl_LocalInvocationID_y, r, sequenceGroupSize, sc->gl_LocalInvocationID_x);
			else
				sc->tempLen = sprintf(sc->tempStr, "\t\t%s%s = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[r], coordinateSuffix, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, r, sequenceGroupSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			if (partialRow) {
				sc->tempLen = sprintf(sc->tempStr, "\t\t}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}

		if (perCoordinate) {
			sc->tempLen = sprintf(sc->tempStr, "\t\t\tbreak;\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		coordinate++;
	} while (perCoordinate && (coordinate < sc->matrixConvolution));

	if (perCoordinate) {
		// closes the emitted switch
		sc->tempLen = sprintf(sc->tempStr, "\t\t}\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}

	res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
	if (res != VKFFT_SUCCESS) return res;
	res = appendZeropadEnd(sc);
	if (res != VKFFT_SUCCESS) return res;
	return res;
}
// Helper for appendCoordinateRegisterPull: appends to the generated kernel the
// statements that copy the registers of one coordinate into sdata.
//
//   sc                  - code generation state; every line is formatted into
//                         sc->tempStr and committed with VkAppendLine.
//   grouped             - 0: registers are spread along gl_LocalInvocationID_x
//                         (extent sc->localSize[0]); nonzero: along
//                         gl_LocalInvocationID_y (extent sc->localSize[1]).
//   used_registers_read - number of registers per thread that are stored.
//   coordinate          - 0 stores sc->regIDs[r]; c > 0 stores "<sc->regIDs[r]>_<c>".
//
// Register 0 is stored unconditionally. For every further register r, when
// localSize * (r + 1) exceeds sc->fftDim, the store is wrapped in
// "if (<thread id> < fftDim - localSize * r) { ... }". The opening and the closing
// line of that guard are driven by the same flag, so they are always emitted as a
// pair around the store.
//
// Returns VKFFT_SUCCESS, or the first error reported by VkAppendLine.
static inline VkFFTResult appendCoordinateRegisterPullStores(VkFFTSpecializationConstantsLayout* sc, int grouped, uint64_t used_registers_read, uint64_t coordinate) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Thread index and workgroup extent along the axis the registers are spread over.
	const char* guardThreadID = grouped ? sc->gl_LocalInvocationID_y : sc->gl_LocalInvocationID_x;
	uint64_t localSize = grouped ? sc->localSize[1] : sc->localSize[0];
	// Registers of coordinate c > 0 carry a "_<c>" suffix; coordinate 0 has none.
	char regSuffix[32] = "";
	if (coordinate > 0) sprintf(regSuffix, "_%" PRIu64 "", coordinate);

	if (grouped) {
		sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s*(%s)+%s] = %s%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0], regSuffix);
	}
	else {
		sc->tempLen = sprintf(sc->tempStr, "\
			sdata[sharedStride * %s + %s] = %s%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0], regSuffix);
	}
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;

	for (uint64_t r = 1; r < used_registers_read; r++) {
		// Evaluated once per register so the "if (...) {" and the "}" can never disagree.
		int guarded = (localSize * (r + 1) > sc->fftDim);
		if (guarded) {
			sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", guardThreadID, sc->fftDim - localSize * r);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		if (grouped) {
			sc->tempLen = sprintf(sc->tempStr, "\
		sdata[%s*(%s+%" PRIu64 "*%s)+%s] = %s%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, r, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x, sc->regIDs[r], regSuffix);
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
			sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, r, sc->gl_WorkGroupSize_x, sc->regIDs[r], regSuffix);
		}
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		if (guarded) {
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
	}
	return res;
}
// Appends the code that writes the per-thread registers of the coordinate selected
// at kernel run time into sdata.
//
// Nothing is emitted unless registers are not read directly (!sc->readToRegisters)
// or a convolution step works on more than one coordinate / kernel. readType values
// outside the two lists below emit nothing as well.
//
// Emitted sequence: barrier, zero-padding start, thread-disable start, the stores,
// thread-disable end, zero-padding end. With sc->matrixConvolution == 1 only the
// plain registers are stored; otherwise a "switch (coordinate)" is generated with
// one case per coordinate, each ending in "break;".
//
// Returns VKFFT_SUCCESS, or the first error returned by one of the append helpers.
static inline VkFFTResult appendCoordinateRegisterPull(VkFFTSpecializationConstantsLayout* sc, uint64_t readType) {
	VkFFTResult res = VKFFT_SUCCESS;
	if ((!sc->readToRegisters) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))) {
		int grouped = 0;
		switch (readType) {
		case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c
			grouped = 0;
			break;
		case 1: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c
			grouped = 1;
			break;
		default:
			// Other read types are not handled here.
			return res;
		}
		uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[grouped]);
		res = appendBarrierVkFFT(sc, 1);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadStart(sc);
		if (res != VKFFT_SUCCESS) return res;
		res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
		if (res != VKFFT_SUCCESS) return res;
		if (sc->matrixConvolution == 1) {
			res = appendCoordinateRegisterPullStores(sc, grouped, used_registers_read, 0);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			if (grouped) {
				sc->tempLen = sprintf(sc->tempStr, "\
	switch (coordinate) {\n\
	case 0:\n");
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "\
		switch (coordinate) {\n\
		case 0:\n");
			}
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			res = appendCoordinateRegisterPullStores(sc, grouped, used_registers_read, 0);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "			break;\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t coordinate = 1; coordinate < sc->matrixConvolution; coordinate++) {
				if (grouped) {
					sc->tempLen = sprintf(sc->tempStr, "\
	case %" PRIu64 ":\n", coordinate);
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "\
		case %" PRIu64 ":\n", coordinate);
				}
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				res = appendCoordinateRegisterPullStores(sc, grouped, used_registers_read, coordinate);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "			break;\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			sc->tempLen = sprintf(sc->tempStr, "		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
		if (res != VKFFT_SUCCESS) return res;
		res = appendZeropadEnd(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	return res;
}
// Appends the prologue of a batched kernel convolution to the generated kernel:
//   1. declarations of a "<reg>_store" copy for each of the first
//      sc->registers_per_thread registers and, for coordinates
//      1..sc->matrixConvolution-1, of "<reg>_<c>_store";
//   2. assignments that fill those copies from the current registers;
//   3. the opening line of a "for (batchID ...)" loop over sc->numKernels.
// The loop opened in step 3 is not closed here.
//
//   floatType       - "float" or "double"; selects the two-component vector type
//                     for the active backend. vecType is left unset for any other value.
//   floatTypeMemory - not used by this function.
//   uintType        - type name used for the batchID loop counter.
//   dataType        - not used by this function.
//
// Returns VKFFT_SUCCESS, or the first error reported by VkAppendLine.
static inline VkFFTResult appendPreparationBatchedKernelConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t dataType) {
	VkFFTResult res = VKFFT_SUCCESS;
	const char* storeSuffix = "_store";
	char vecType[30];
#if(VKFFT_BACKEND==0)
	// GLSL vector type names
	if (strcmp(floatType, "float") == 0) sprintf(vecType, "vec2");
	if (strcmp(floatType, "double") == 0) sprintf(vecType, "dvec2");
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	// the remaining backends share the same vector type names
	if (strcmp(floatType, "float") == 0) sprintf(vecType, "float2");
	if (strcmp(floatType, "double") == 0) sprintf(vecType, "double2");
#endif

	// pass 0 emits all declarations, pass 1 emits all copies
	for (int pass = 0; pass < 2; pass++) {
		for (uint64_t reg = 0; reg < sc->registers_per_thread; reg++) {
			const char* regName = sc->regIDs[reg];
			if (pass == 0)
				sc->tempLen = sprintf(sc->tempStr, "		%s %s%s;\n", vecType, regName, storeSuffix);
			else
				sc->tempLen = sprintf(sc->tempStr, "			%s%s=%s;\n", regName, storeSuffix, regName);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;

			// extra coordinates of a matrix convolution use the "<reg>_<c>" registers
			for (uint64_t coord = 1; coord < sc->matrixConvolution; coord++) {
				if (pass == 0)
					sc->tempLen = sprintf(sc->tempStr, "		%s %s_%" PRIu64 "%s;\n", vecType, regName, coord, storeSuffix);
				else
					sc->tempLen = sprintf(sc->tempStr, "			%s_%" PRIu64 "%s=%s_%" PRIu64 ";\n", regName, coord, storeSuffix, regName, coord);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
	}

	sc->tempLen = sprintf(sc->tempStr, "	for (%s batchID=0;  batchID < %" PRIu64 "; batchID++){\n", uintType, sc->numKernels);
	return VkAppendLine(sc);
}
// Appends the code that multiplies every used register by the matching element of
// the BluesteinConvolutionKernel buffer (a complex product).
//
// Emitted sequence: zero-padding start, thread-disable start, declarations of
// temp_real<j>/temp_imag<j> for j < sc->matrixConvolution, then per register:
// optional bounds guard, computation of the kernel index into sc->inoutID, the
// product into temp_real0/temp_imag0, the write-back into the register, guard end.
// Finally thread-disable end and zero-padding end.
//
//   floatType - type name used for the temp_real/temp_imag declarations.
//   uintType  - not used by this function.
//   dataType  - selects the axis the registers are spread over (local x or local y);
//               for any other value a single register is processed with no index
//               computation and no guard.
//
// When sc->inverseBluestein is set and sc->fftDim == sc->fft_dim_full, the signs of
// the cross terms are flipped.
//
// Returns VKFFT_SUCCESS, or the first error returned by one of the append helpers.
static inline VkFFTResult appendBluesteinConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t dataType) {
	VkFFTResult res = VKFFT_SUCCESS;
	const char* kernelName = "BluesteinConvolutionKernel";
	char shiftX[500] = "";
	char requestCoordinate[100] = "";
	char requestBatch[100] = "";
	char separateRegisterStore[100] = "";

	if (sc->performWorkGroupShift[0])
		sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
	if (sc->convolutionStep) {
		if (sc->matrixConvolution > 1)
			sprintf(requestCoordinate, "0");
		if (sc->numKernels > 1) {
			sprintf(requestBatch, "batchID");
			sprintf(separateRegisterStore, "_store");
		}
	}

	res = appendZeropadStart(sc);
	if (res != VKFFT_SUCCESS) return res;
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;

	for (uint64_t coord = 0; coord < sc->matrixConvolution; coord++) {
		sc->tempLen = sprintf(sc->tempStr, "		%s temp_real%" PRIu64 " = 0;\n", floatType, coord);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "		%s temp_imag%" PRIu64 " = 0;\n", floatType, coord);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}

	// Classify dataType once: -1 = not listed, 0 = local x axis, 1 = local y axis.
	int axis = -1;
	switch (dataType) {
	case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
		axis = 0;
		break;
	case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145:
		axis = 1;
		break;
	default:
		break;
	}

	uint64_t used_registers_read = 1;
	uint64_t axisLocalSize = 0;
	if (axis >= 0) {
		axisLocalSize = sc->localSize[axis];
		used_registers_read = (uint64_t)ceil(sc->fftDim / (double)axisLocalSize);
	}
	const char* axisThreadID = (axis == 1) ? sc->gl_LocalInvocationID_y : sc->gl_LocalInvocationID_x;

	// Sign of the cross terms of the complex product.
	const int flipSigns = (sc->inverseBluestein) && (sc->fftDim == sc->fft_dim_full);
	const char realSign = flipSigns ? '+' : '-';
	const char imagSign = flipSigns ? '-' : '+';

	for (uint64_t reg = 0; reg < used_registers_read; reg++) {
		// The last register may only be partially populated; guard it by thread index.
		const int guarded = (axis >= 0) && (axisLocalSize * (reg + 1) > sc->fftDim);
		if (guarded) {
			sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", axisThreadID, sc->fftDim - axisLocalSize * reg);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		if (axis == 0) {
			if (sc->fftDim == sc->fft_dim_full)
				sc->tempLen = sprintf(sc->tempStr, "		%s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_x, reg * sc->localSize[0]);
			else
				sc->tempLen = sprintf(sc->tempStr, "		%s = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->inoutID, sc->gl_LocalInvocationID_x, reg * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else if (axis == 1) {
			if (sc->fftDim == sc->fft_dim_full)
				sc->tempLen = sprintf(sc->tempStr, "			%s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_y, reg * sc->localSize[1]);
			else
				sc->tempLen = sprintf(sc->tempStr, "		%s = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->inoutID, sc->stageStartSize, sc->gl_LocalInvocationID_y, (reg)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}

		// Complex product of the kernel element with the (possibly "_store") register.
		sc->tempLen = sprintf(sc->tempStr, "		temp_real0 = %s[inoutID].x * %s%s.x %c %s[inoutID].y * %s%s.y;\n", kernelName, sc->regIDs[reg], separateRegisterStore, realSign, kernelName, sc->regIDs[reg], separateRegisterStore);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "		temp_imag0 = %s[inoutID].x * %s%s.y %c %s[inoutID].y * %s%s.x;\n", kernelName, sc->regIDs[reg], separateRegisterStore, imagSign, kernelName, sc->regIDs[reg], separateRegisterStore);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;

		sc->tempLen = sprintf(sc->tempStr, "		%s.x = temp_real0;\n", sc->regIDs[reg]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "		%s.y = temp_imag0;\n", sc->regIDs[reg]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;

		if (guarded) {
			sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
	}

	res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
	if (res != VKFFT_SUCCESS) return res;
	return appendZeropadEnd(sc);
}

static inline VkFFTResult appendKernelConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t dataType) {
	VkFFTResult res = VKFFT_SUCCESS;
	char convTypeLeft[20] = "";
	char convTypeRight[20] = "";
	if ((!strcmp(floatType, "float")) && (strcmp(floatTypeMemory, "float"))) {
#if(VKFFT_BACKEND==0)
		sprintf(convTypeLeft, "float(");
		sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==1)
		sprintf(convTypeLeft, "(float)");
		//sprintf(convTypeRight, "");
#elif(VKFFT_BACKEND==2)
		sprintf(convTypeLeft, "(float)");
		//sprintf(convTypeRight, "");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		sprintf(convTypeLeft, "(float)");
		//sprintf(convTypeRight, "");
#endif
	}
	if ((!strcmp(floatType, "double")) && (strcmp(floatTypeMemory, "double"))) {
#if(VKFFT_BACKEND==0)
		sprintf(convTypeLeft, "double(");
		sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==1)
		sprintf(convTypeLeft, "(double)");
		//sprintf(convTypeRight, "");
#elif(VKFFT_BACKEND==2)
		sprintf(convTypeLeft, "(double)");
		//sprintf(convTypeRight, "");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		sprintf(convTypeLeft, "(double)");
		//sprintf(convTypeRight, "");
#endif
	}

	char shiftX[500] = "";
	if (sc->performWorkGroupShift[0])
		sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
	char requestCoordinate[100] = "";
	if (sc->convolutionStep) {
		if (sc->matrixConvolution > 1) {
			sprintf(requestCoordinate, "0");
		}
	}
	char index_x[2000] = "";
	char index_y[2000] = "";
	char requestBatch[100] = "";
	char separateRegisterStore[100] = "";
	if (sc->convolutionStep) {
		if (sc->numKernels > 1) {
			sprintf(requestBatch, "batchID");
			sprintf(separateRegisterStore, "_store");
		}
	}
	res = appendZeropadStart(sc);
	if (res != VKFFT_SUCCESS) return res;
	res = VkAppendLineFromInput(sc, sc->disableThreadsStart);
	if (res != VKFFT_SUCCESS) return res;
	for (uint64_t j = 0; j < sc->matrixConvolution; j++) {
		sc->tempLen = sprintf(sc->tempStr, "		%s temp_real%" PRIu64 " = 0;\n", floatType, j);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, "		%s temp_imag%" PRIu64 " = 0;\n", floatType, j);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	uint64_t used_registers_read = 1;
	switch (dataType) {
	case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
		used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]);
		break;
	case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145:
		used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]);
		break;
	}
	for (uint64_t i = 0; i < used_registers_read; i++) {
		if (i > 0) {
			for (uint64_t j = 0; j < sc->matrixConvolution; j++) {
				sc->tempLen = sprintf(sc->tempStr, "		temp_real%" PRIu64 " = 0;\n", j);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		temp_imag%" PRIu64 " = 0;\n", j);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		switch (dataType) {
		case 0:
		{
			if (sc->fftDim == sc->fft_dim_full) {
				if (sc->localSize[1] == 1)
					sc->tempLen = sprintf(sc->tempStr, "		combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
				else
					sc->tempLen = sprintf(sc->tempStr, "		combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, i * sc->localSize[0] * sc->localSize[1]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				if ((1 + i) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) {
					sc->tempLen = sprintf(sc->tempStr, "		if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				if (sc->inputStride[0] > 1) {
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sprintf(index_x, "(combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 "", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1]);
					uint64_t tempSaveInputOffset = sc->inputOffset;
					uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize;
					sc->inputOffset = sc->kernelOffset;
					sc->inputNumberByteSize = sc->kernelNumberByteSize;
					res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->inputOffset = tempSaveInputOffset;
					sc->inputNumberByteSize = tempSaveInputNumberByteSize;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					//sc->tempLen = sprintf(sc->tempStr, "		inoutID = indexInput((combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 "%s%s);\n", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1], requestCoordinate, requestBatch);
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sprintf(index_x, "(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 "", sc->fftDim, sc->fftDim, sc->inputStride[1]);
					uint64_t tempSaveInputOffset = sc->inputOffset;
					uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize;
					sc->inputOffset = sc->kernelOffset;
					sc->inputNumberByteSize = sc->kernelNumberByteSize;
					res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, 0, requestCoordinate, requestBatch);
					if (res != VKFFT_SUCCESS) return res;
					sc->inputOffset = tempSaveInputOffset;
					sc->inputNumberByteSize = tempSaveInputNumberByteSize;
					sc->tempLen = sprintf(sc->tempStr, ";\n");
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					//sc->tempLen = sprintf(sc->tempStr, "		inoutID = indexInput((combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 "%s%s);\n", sc->fftDim, sc->fftDim, sc->inputStride[1], requestCoordinate, requestBatch);
				}
			}
			else {
				if (sc->localSize[0] * (i + 1) > sc->fftDim) {
					sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sprintf(index_x, "%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize);
				uint64_t tempSaveInputOffset = sc->inputOffset;
				uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize;
				sc->inputOffset = sc->kernelOffset;
				sc->inputNumberByteSize = sc->kernelNumberByteSize;
				res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, 0, requestCoordinate, requestBatch);
				if (res != VKFFT_SUCCESS) return res;
				sc->inputOffset = tempSaveInputOffset;
				sc->inputNumberByteSize = tempSaveInputNumberByteSize;
				sc->tempLen = sprintf(sc->tempStr, ";\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				//sc->tempLen = sprintf(sc->tempStr, "		inoutID = indexInput(%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch);
			}
			break;
		}
		case 1:
		{
			if (sc->localSize[1] * (i + 1) > sc->fftDim) {
				sc->tempLen = sprintf(sc->tempStr, "\
		if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			sc->tempLen = sprintf(sc->tempStr, "			%s = ", sc->inoutID);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x);
			sprintf(index_y, "(%s+%" PRIu64 ")+((%s%s)/%" PRIu64 ")%%(%" PRIu64 ")+((%s%s)/%" PRIu64 ")*(%" PRIu64 ")", sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim);
			uint64_t tempSaveInputOffset = sc->inputOffset;
			uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize;
			sc->inputOffset = sc->kernelOffset;
			sc->inputNumberByteSize = sc->kernelNumberByteSize;
			res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, index_y, requestCoordinate, requestBatch);
			if (res != VKFFT_SUCCESS) return res;
			sc->inputOffset = tempSaveInputOffset;
			sc->inputNumberByteSize = tempSaveInputNumberByteSize;
			sc->tempLen = sprintf(sc->tempStr, ";\n");
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			//sc->tempLen = sprintf(sc->tempStr, "		inoutID = indexInput((%s%s) %% (%" PRIu64 "), (%s+%" PRIu64 ")+((%s%s)/%" PRIu64 ")%%(%" PRIu64 ")+((%s%s)/%" PRIu64 ")*(%" PRIu64 ")%s%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim, requestCoordinate, requestBatch);
			break;
		}
		}
		char kernelName[100] = "";
		sprintf(kernelName, "kernel_obj");
		if ((sc->kernelBlockNum == 1) || (sc->useBluesteinFFT)) {
			for (uint64_t j = 0; j < sc->matrixConvolution; j++) {
				for (uint64_t l = 0; l < sc->matrixConvolution; l++) {
					uint64_t k = 0;
					if (sc->symmetricKernel) {
						k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l);
					}
					else {
						k = (j * sc->matrixConvolution + l);
					}
					if (sc->conjugateConvolution == 0) {
						if (l == 0)
							sc->tempLen = sprintf(sc->tempStr, "		temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.x - %s%s[inoutID+%" PRIu64 "].y%s * %s%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore);
						else
							sc->tempLen = sprintf(sc->tempStr, "		temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.x - %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore);
					}
					else {
						if (l == 0)
							sc->tempLen = sprintf(sc->tempStr, "		temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.x + %s%s[inoutID+%" PRIu64 "].y%s * %s%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore);
						else
							sc->tempLen = sprintf(sc->tempStr, "		temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.x + %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore);
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				for (uint64_t l = 0; l < sc->matrixConvolution; l++) {
					uint64_t k = 0;
					if (sc->symmetricKernel) {
						k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l);
					}
					else {
						k = (j * sc->matrixConvolution + l);
					}
					if (sc->conjugateConvolution == 0) {
						if (l == 0)
							sc->tempLen = sprintf(sc->tempStr, "		temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.y + %s%s[inoutID+%" PRIu64 "].y%s * %s%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore);
						else
							sc->tempLen = sprintf(sc->tempStr, "		temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.y + %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore);
					}
					else {
						if (sc->conjugateConvolution == 1) {
							if (l == 0)
								sc->tempLen = sprintf(sc->tempStr, "		temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].y%s * %s%s.x - %s%s[inoutID+%" PRIu64 "].x%s * %s%s.y ;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore);
							else
								sc->tempLen = sprintf(sc->tempStr, "		temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.x - %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore);
						}
						else {
							if (l == 0)
								sc->tempLen = sprintf(sc->tempStr, "		temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.y - %s%s[inoutID+%" PRIu64 "].y%s * %s%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore);
							else
								sc->tempLen = sprintf(sc->tempStr, "		temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.y - %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore);
						}
					}
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

				}
			}
			if (sc->crossPowerSpectrumNormalization) {
#if(VKFFT_BACKEND==0)
				sc->tempLen = sprintf(sc->tempStr, "		w.x = inversesqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
#elif(VKFFT_BACKEND==1)
				sc->tempLen = sprintf(sc->tempStr, "		w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
#elif(VKFFT_BACKEND==2)
				sc->tempLen = sprintf(sc->tempStr, "		w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
				sc->tempLen = sprintf(sc->tempStr, "		w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
#endif
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		%s.x = temp_real0 * w.x;\n", sc->regIDs[i]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		%s.y = temp_imag0 * w.x;\n", sc->regIDs[i]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			else {
				sc->tempLen = sprintf(sc->tempStr, "		%s.x = temp_real0;\n", sc->regIDs[i]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		%s.y = temp_imag0;\n", sc->regIDs[i]);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t l = 1; l < sc->matrixConvolution; l++) {
				if (sc->crossPowerSpectrumNormalization) {
#if(VKFFT_BACKEND==0)
					sc->tempLen = sprintf(sc->tempStr, "		w.x = inversesqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l);
#elif(VKFFT_BACKEND==1)
					sc->tempLen = sprintf(sc->tempStr, "		w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l);
#elif(VKFFT_BACKEND==2)
					sc->tempLen = sprintf(sc->tempStr, "		w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l);
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
					sc->tempLen = sprintf(sc->tempStr, "		w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l);
#endif
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s_%" PRIu64 ".x = temp_real%" PRIu64 " * w.x;\n", sc->regIDs[i], l, l);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s_%" PRIu64 ".y = temp_imag%" PRIu64 " * w.x;\n", sc->regIDs[i], l, l);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
				else {
					sc->tempLen = sprintf(sc->tempStr, "		%s_%" PRIu64 ".x = temp_real%" PRIu64 ";\n", sc->regIDs[i], l, l);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
					sc->tempLen = sprintf(sc->tempStr, "		%s_%" PRIu64 ".y = temp_imag%" PRIu64 ";\n", sc->regIDs[i], l, l);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
		}
		else {
			for (uint64_t j = 0; j < sc->matrixConvolution; j++) {

				sc->tempLen = sprintf(sc->tempStr, "		%s temp_real%" PRIu64 " = 0;\n", floatType, j);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				for (uint64_t l = 0; l < sc->matrixConvolution; l++) {
					uint64_t k = 0;
					if (sc->symmetricKernel) {
						k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l);
					}
					else {
						k = (j * sc->matrixConvolution + l);
					}
					if (l == 0)
						sc->tempLen = sprintf(sc->tempStr, "		temp_real%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s%s.x - %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s%s.y;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore);
					else
						sc->tempLen = sprintf(sc->tempStr, "		temp_real%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s_%" PRIu64 "%s.x - %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;

				}

				sc->tempLen = sprintf(sc->tempStr, "		%s temp_imag%" PRIu64 " = 0;\n", floatType, j);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				for (uint64_t l = 0; l < sc->matrixConvolution; l++) {
					uint64_t k = 0;
					if (sc->symmetricKernel) {
						k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l);
					}
					else {
						k = (j * sc->matrixConvolution + l);
					}
					if (l == 0)
						sc->tempLen = sprintf(sc->tempStr, "		temp_imag%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s%s.y + %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s%s.x;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore);
					else
						sc->tempLen = sprintf(sc->tempStr, "		temp_imag%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s_%" PRIu64 "%s.y + %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s_%" PRIu64 "%s.x;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return res;
				}
			}
			sc->tempLen = sprintf(sc->tempStr, "		%s.x = temp_real0;\n", sc->regIDs[i]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			sc->tempLen = sprintf(sc->tempStr, "		%s.y = temp_imag0;\n", sc->regIDs[i]);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
			for (uint64_t l = 1; l < sc->matrixConvolution; l++) {
				sc->tempLen = sprintf(sc->tempStr, "		%s_%" PRIu64 ".x = temp_real%" PRIu64 ";\n", sc->regIDs[i], l, l);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
				sc->tempLen = sprintf(sc->tempStr, "		%s_%" PRIu64 ".y = temp_imag%" PRIu64 ";\n", sc->regIDs[i], l, l);
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
		switch (dataType) {
		case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144:
		{
			if (sc->localSize[0] * (i + 1) > sc->fftDim) {
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			break;
		}
		case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145:
		{
			if (sc->localSize[1] * (i + 1) > sc->fftDim) {
				sc->tempLen = sprintf(sc->tempStr, "\
		}\n");
				res = VkAppendLine(sc);
				if (res != VKFFT_SUCCESS) return res;
			}
			break;
		}
		}
	}
	res = VkAppendLineFromInput(sc, sc->disableThreadsEnd);
	if (res != VKFFT_SUCCESS) return res;
	res = appendZeropadEnd(sc);
	if (res != VKFFT_SUCCESS) return res;
	return res;
}
// Decides the value of sc->writeFromRegisters (0 or 1) for the given write type.
//
// This function only sets the flag; it does not append any text to the generated
// shader. The write-stage generator emits the barrier it needs by itself when the
// flag is 0 (see the "if (!sc->writeFromRegisters)" check in appendWriteDataVkFFT),
// so emitting one here as well would be redundant and would place it wherever this
// setter happens to be called from.
//
// Parameters:
//   sc        - specialization constants layout; reads localSize, stageRadix,
//               registers_per_thread_per_radix, rader_generator, numStages, fftDim
//               and axisSwapped; writes writeFromRegisters.
//   writeType - write layout identifier (0, 1, 2, 5, 6 or one of the 110..145 types).
//
// Returns VKFFT_SUCCESS in all cases. Write types that are not listed leave
// sc->writeFromRegisters untouched.
static inline VkFFTResult setWriteFromRegisters(VkFFTSpecializationConstantsLayout* sc, uint64_t writeType) {
	VkFFTResult res = VKFFT_SUCCESS;
	switch (writeType) {
	case 0: //single_c2c
	{
		// Flag is cleared if more than one row is processed per workgroup, if the
		// last stage covers more elements than fftDim, or if the last stage has a
		// non-zero rader_generator entry.
		if ((sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) {
			sc->writeFromRegisters = 0;
		}
		else {
			sc->writeFromRegisters = 1;
		}
		break;
	}
	case 1: //grouped_c2c
	{
		// Same size/rader_generator test as the other c2c cases, using localSize[1].
		// No barrier is appended here: this keeps the branch consistent with
		// cases 0, 2 and 6, which only set the flag.
		if ((sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) {
			sc->writeFromRegisters = 0;
		}
		else {
			sc->writeFromRegisters = 1;
		}
		break;
	}
	case 2: //single_c2c_strided
	{
		if ((sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) {
			sc->writeFromRegisters = 0;
		}
		else {
			sc->writeFromRegisters = 1;
		}
		break;
	}
	case 5://single_r2c
	{
		// r2c output always has the flag cleared.
		sc->writeFromRegisters = 0;
		break;
	}
	case 6: //single_c2r
	{
		// Same test as case 0, with axisSwapped as an additional reason to clear the flag.
		if ((sc->axisSwapped) || (sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) {
			sc->writeFromRegisters = 0;
		}
		else {
			sc->writeFromRegisters = 1;
		}
		break;
	}
	case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145:
	{
		// All 11x-14x write types have the flag cleared unconditionally.
		sc->writeFromRegisters = 0;
		break;
	}
	default:
		// Unlisted write types: leave sc->writeFromRegisters as it was.
		break;
	}
	return res;
}
static inline VkFFTResult appendWriteDataVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t writeType) {
	VkFFTResult res = VKFFT_SUCCESS;
	long double double_PI = 3.14159265358979323846264338327950288419716939937510L;
	char vecType[30];
	char outputsStruct[20] = "";
	char LFending[4] = "";
	if (!strcmp(floatType, "float")) sprintf(LFending, "f");
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
	if (sc->outputBufferBlockNum == 1)
		sprintf(outputsStruct, "outputs");
	else
		sprintf(outputsStruct, ".outputs");
	if (!strcmp(floatType, "double")) sprintf(LFending, "LF");
	char cosDef[20] = "cos";
	char sinDef[20] = "sin";
#elif(VKFFT_BACKEND==1)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	sprintf(outputsStruct, "outputs");
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
	char cosDef[20] = "__cosf";
	char sinDef[20] = "__sinf";
#elif(VKFFT_BACKEND==2)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	sprintf(outputsStruct, "outputs");
	if (!strcmp(floatType, "double")) sprintf(LFending, "l");
	char cosDef[20] = "__cosf";
	char sinDef[20] = "__sinf";
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	sprintf(outputsStruct, "outputs");
	//if (!strcmp(floatType, "double")) sprintf(LFending, "l");
	char cosDef[20] = "native_cos";
	char sinDef[20] = "native_sin";
#endif
	char convTypeLeft[20] = "";
	char convTypeRight[20] = "";
	if ((!strcmp(floatTypeMemory, "half")) && (strcmp(floatType, "half"))) {
		if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
			sprintf(convTypeLeft, "float16_t(");
			sprintf(convTypeRight, ")");
		}
		else {
			sprintf(convTypeLeft, "f16vec2(");
			sprintf(convTypeRight, ")");
		}
	}
	if ((!strcmp(floatTypeMemory, "float")) && (strcmp(floatType, "float"))) {
		if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
#if(VKFFT_BACKEND==0)
			sprintf(convTypeLeft, "float(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==1)
			sprintf(convTypeLeft, "(float)");
			//sprintf(convTypeRight, "");
#elif(VKFFT_BACKEND==2)
			sprintf(convTypeLeft, "(float)");
			//sprintf(convTypeRight, "");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			sprintf(convTypeLeft, "(float)");
			//sprintf(convTypeRight, "");
#endif
		}
		else {
#if(VKFFT_BACKEND==0)
			sprintf(convTypeLeft, "vec2(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==1)
			sprintf(convTypeLeft, "conv_float2(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==2)
			sprintf(convTypeLeft, "conv_float2(");
			sprintf(convTypeRight, ")");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			sprintf(convTypeLeft, "conv_float2(");
			sprintf(convTypeRight, ")");
#endif
		}
	}
	if ((!strcmp(floatTypeMemory, "double")) && (strcmp(floatType, "double"))) {
		if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
#if(VKFFT_BACKEND==0)
			sprintf(convTypeLeft, "double(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==1)
			sprintf(convTypeLeft, "(double)");
			//sprintf(convTypeRight, "");
#elif(VKFFT_BACKEND==2)
			sprintf(convTypeLeft, "(double)");
			//sprintf(convTypeRight, "");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			sprintf(convTypeLeft, "(double)");
			//sprintf(convTypeRight, "");
#endif
		}
		else {
#if(VKFFT_BACKEND==0)
			sprintf(convTypeLeft, "dvec2(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==1)
			sprintf(convTypeLeft, "conv_double2(");
			sprintf(convTypeRight, ")");
#elif(VKFFT_BACKEND==2)
			sprintf(convTypeLeft, "conv_double2(");
			sprintf(convTypeRight, ")");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
			sprintf(convTypeLeft, "conv_double2(");
			sprintf(convTypeRight, ")");
#endif
		}
	}

	char index_x[2000] = "";
	char index_y[2000] = "";
	char requestCoordinate[100] = "";
	if (sc->convolutionStep) {
		if (sc->matrixConvolution > 1) {
			sprintf(requestCoordinate, "coordinate");
		}
	}
	char requestBatch[100] = "";
	if (sc->convolutionStep) {
		if (sc->numKernels > 1) {
			sprintf(requestBatch, "batchID");//if one buffer - multiple kernel convolution
		}
	}
	switch (writeType) {
	case 0: //single_c2c
	{
		if (!sc->writeFromRegisters) {
			res = appendBarrierVkFFT(sc, 1);
			if (res != VKFFT_SUCCESS) return res;
		}
		//res = appendZeropadStart(sc);
		//if (res != VKFFT_SUCCESS) return res;
		char shiftX[500] = "";
		if (sc->performWorkGroupShift[0])
			sprintf(shiftX, " + consts.workGroupShiftX ");
		char shiftY[500] = "";
		if (sc->axisSwapped) {
			if (sc->performWorkGroupShift[1])
				sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x);
		}
		else {
			if (sc->performWorkGroupShift[1])
				sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y);
		}

		char shiftY2[100] = "";
		if (sc->performWorkGroupShift[1])
			sprintf(shiftY, " + consts.workGroupShiftY ");
		if (sc->fftDim < sc->fft_dim_full) {
			if (sc->axisSwapped) {
				if (!sc->reorderFourStep) {
					sc->tempLen = sprintf(sc->tempStr, "		if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y);
					res = VkAppendLine(sc);
					if (res != VKFFT_SUCCESS) return 