CodeGuru Home VC++ / MFC / C++ .NET / C# Visual Basic VB Forums Developer.com
Results 1 to 4 of 4
  1. #1
    Join Date
    Oct 2017
    Posts
    50

    Why is my single-threaded code faster than my multithreaded code?

    I just tried to implement multithreaded rendering with Vulkan, and the result is disappointing: the multithreaded code is 4x slower than the single-threaded code. I tried multithreading in my CPU ray tracer and it runs 10x faster than the single-threaded version. Why is my Vulkan multithreaded rendering slower?

    Here's the code :
    Code:
    // Records the draws in [firstIndex, lastIndex) of *drawInfo into cmdBuffer.CommandBuffer.
    // The command buffer is begun with inheritance info and
    // VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT, i.e. as a secondary command buffer
    // that continues the render pass owned by the caller.
    //
    // device            - device handle passed to vkResetCommandPool.
    // cmdBuffer         - command pool / command buffer pair used by this call; the pool is reset first
    //                     (presumably CommandBuffer was allocated from CommandPool - confirm at the allocation site).
    // pRenderPass       - provides the render pass handle and the framebuffer handle to inherit.
    // drawInfo          - full draw list; only entries in [firstIndex, lastIndex) are recorded.
    // firstIndex        - index of the first draw to record; its global descriptor sets are bound once up front.
    // lastIndex         - one past the last draw to record.
    // currentFrameIndex - passed to GetFrambufferHandle to pick the inherited framebuffer.
    // viewports         - viewports set starting at slot 0.
    // scissors          - scissor rectangles set starting at slot 0.
    void CommandBufferVulkan::DrawMultiThread(VkDevice device, PerThreadCommandBuffer cmdBuffer, RenderPassVulkan* pRenderPass, const std::vector<DrawIndexedMultiThreadInfo>* drawInfo, uint32_t firstIndex,
    		uint32_t lastIndex, uint32_t currentFrameIndex, std::vector<VkViewport>* viewports, std::vector<VkRect2D>* scissors)
    	{
    		vkResetCommandPool(device, cmdBuffer.CommandPool, 0);
    
    		VkCommandBufferInheritanceInfo inheriteInfo = {};
    		inheriteInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO;
    		inheriteInfo.renderPass = pRenderPass->GetRenderPassHandle();
    		inheriteInfo.framebuffer = pRenderPass->GetFrambufferHandle(currentFrameIndex);
    
    		VkCommandBufferBeginInfo cmdBufferBeginInfo = {};
    		cmdBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    		cmdBufferBeginInfo.pInheritanceInfo = &inheriteInfo;
    		cmdBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
    
    		vkBeginCommandBuffer(cmdBuffer.CommandBuffer, &cmdBufferBeginInfo);
    		vkCmdSetViewport(cmdBuffer.CommandBuffer, 0, static_cast<uint32_t>(viewports->size()), viewports->data());
    		vkCmdSetScissor(cmdBuffer.CommandBuffer, 0, static_cast<uint32_t>(scissors->size()), scissors->data());
    
    		// An empty range has no "first draw" to take the global descriptor sets from, so
    		// reading (*drawInfo)[firstIndex] would be out of bounds. Close the command buffer
    		// so it is still a valid (empty) recording and bail out.
    		if (firstIndex >= lastIndex)
    		{
    			vkEndCommandBuffer(cmdBuffer.CommandBuffer);
    			return;
    		}
    
    		const std::vector<DrawIndexedMultiThreadInfo>& draws = *drawInfo;
    
    		// The global descriptor sets are taken from the first draw of the range and bound once.
    		const DrawIndexedMultiThreadInfo& firstDraw = draws[firstIndex];
    
    		std::vector<VkDescriptorSet> globalSets;
    		globalSets.reserve(firstDraw.PGlobalDescriptorSetBindInfo->PDescriptorSets.size());
    
    		for (uint32_t i = 0; i < firstDraw.PGlobalDescriptorSetBindInfo->PDescriptorSets.size(); ++i)
    			globalSets.push_back(firstDraw.PGlobalDescriptorSetBindInfo->PDescriptorSets[i]->GetDescriptorSetHandle());
    
    		vkCmdBindDescriptorSets(cmdBuffer.CommandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, firstDraw.PGlobalDescriptorSetBindInfo->PPipelineLayout->GetPipelineLayoutHandle(),
    			firstDraw.PGlobalDescriptorSetBindInfo->FirstSet, static_cast<uint32_t>(globalSets.size()), globalSets.data(), 0, nullptr);
    
    		// Last pipeline bound in this command buffer; used to skip redundant vkCmdBindPipeline calls.
    		PipelineVulkan* pPipeline = nullptr;
    
    		// Scratch list of per-draw descriptor set handles. It lives outside the loop so its
    		// storage is reused instead of being heap-allocated again for every draw.
    		std::vector<VkDescriptorSet> sets;
    
    		for (uint32_t i = firstIndex; i < lastIndex; ++i)
    		{
    			const DrawIndexedMultiThreadInfo& dii = draws[i];
    
    			if (dii.PPipelineVulkan != pPipeline)
    			{
    				vkCmdBindPipeline(cmdBuffer.CommandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, dii.PPipelineVulkan->GetPipelineHandle());
    				pPipeline = dii.PPipelineVulkan;
    			}
    
    			sets.clear();
    			sets.reserve(dii.DescriptorBindInfo.PDescriptorSets.size());
    
    			for (uint32_t j = 0; j < dii.DescriptorBindInfo.PDescriptorSets.size(); ++j)
    				sets.push_back(dii.DescriptorBindInfo.PDescriptorSets[j]->GetDescriptorSetHandle());
    
    			// The same layout serves both the descriptor bind and the push constants; fetch it once.
    			const auto layout = dii.DescriptorBindInfo.PPipelineLayout->GetPipelineLayoutHandle();
    
    			vkCmdBindDescriptorSets(cmdBuffer.CommandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, layout,
    				dii.DescriptorBindInfo.FirstSet, static_cast<uint32_t>(sets.size()), sets.data(), 0, nullptr);
    
    			vkCmdPushConstants(cmdBuffer.CommandBuffer, layout, dii.PushConstantStage, 0, dii.PushConstantSize,
    				dii.PPushConstantData);
    
    			VkBuffer vertexBuffer = dii.PVertexBuffer->GetBufferHandle();
    			VkBuffer indexBuffer = dii.PIndexBuffer->GetBufferHandle();
    			VkDeviceSize offset = 0;
    
    			vkCmdBindVertexBuffers(cmdBuffer.CommandBuffer, 0, 1, &vertexBuffer, &offset);
    			vkCmdBindIndexBuffer(cmdBuffer.CommandBuffer, indexBuffer, 0, VK_INDEX_TYPE_UINT32);
    			vkCmdDrawIndexed(cmdBuffer.CommandBuffer, dii.IndexCount, 1, 0, 0, 0);
    		}
    
    		vkEndCommandBuffer(cmdBuffer.CommandBuffer);
    	}
    
    	void CommandBufferVulkan::DrawIndexedMultiThread(const std::vector<DrawIndexedMultiThreadInfo>& info)
    	{
    		if (info.size() < std::thread::hardware_concurrency())
    		{
    			printf("Draw count lower than num of thread!\n");
    			return;
    		}
    
    		uint32_t threadCount = std::thread::hardware_concurrency();
    		uint32_t drawCountPerThread = (uint32_t)(info.size() / threadCount);
    		uint32_t drawCountPerThreadMod = info.size() % threadCount;
    		uint32_t firstIndex = 0;
    		uint32_t lastIndex = drawCountPerThread;
    
    		VkDevice device = mpDeviceVulkan->GetDeviceHandle();
            
    		std::vector<std::thread> threads(threadCount);
    
    		for (uint32_t i = 0; i < threadCount; ++i)
    		{
    			if (i == (threadCount - 1))
    			{
    				lastIndex += drawCountPerThreadMod;
    				threads[i] =std::thread(DrawMultiThread, device, mPerThreadCommandBuffers[mCurrentFrame][i], mpCurrentRenderPass, &info, firstIndex, lastIndex, mCurrentFrame, &mCurrentViewports, &mCurrentScissors);
    			}
    			else
    			{
    				threads[i] = std::thread(DrawMultiThread, device, mPerThreadCommandBuffers[mCurrentFrame][i], mpCurrentRenderPass, &info, firstIndex, lastIndex, mCurrentFrame, &mCurrentViewports, &mCurrentScissors);
    				
    				firstIndex += drawCountPerThread;
    				lastIndex += drawCountPerThread;
    			}
    		}
    
    		for (uint32_t i = 0; i < threadCount; ++i)
    			threads[i].join();
    
    		std::vector<VkCommandBuffer> commandBuffers;
    
    		for (uint32_t i = 0; i < mPerThreadCommandBuffers[mCurrentFrame].size(); ++i)
    			commandBuffers.push_back(mPerThreadCommandBuffers[mCurrentFrame][i].CommandBuffer);
    
    		vkCmdExecuteCommands(mCurrentVkCommandBuffer, (uint32_t)commandBuffers.size(), commandBuffers.data());
    	}
    Here are the results :
    Multithreading
    Name:  mt.jpg
Views: 934
Size:  77.7 KB

    Singlethreading
    Name:  st.jpg
Views: 672
Size:  50.6 KB

  2. #2
    2kaud's Avatar
    2kaud is offline Super Moderator Power Poster
    Join Date
    Dec 2012
    Location
    England
    Posts
    7,822

    Re: Why is my single-threaded code faster than my multithreaded code?

    I don't know Vulkan, so I can only generalise.

    It is a known fact that multi-threaded code can be slower than single-threaded code. There are a couple of reasons for this. One is that it takes time to set up a new thread. Repeatedly creating and destroying threads can have a real detrimental effect on performance. For multi-threading to be effective, the time taken for the 'work' of the thread should be much greater than the time to create/destroy the thread.

    Another reason is synchronization. If the thread code has to continuously use synchronization (or atomic etc) to execute some code or to update some variables then this again would decrease the multi-threaded performance as time is 'wasted' waiting for the sync/update.
    All advice is offered in good faith only. All my code is tested (unless stated explicitly otherwise) with the latest version of Microsoft Visual Studio (using the supported features of the latest standard) and is offered as examples only - not as production quality. I cannot offer advice regarding any other c/c++ compiler/IDE or incompatibilities with VS. You are ultimately responsible for the effects of your programs and the integrity of the machines they run on. Anything I post, code snippets, advice, etc is licensed as Public Domain https://creativecommons.org/publicdomain/zero/1.0/ and can be used without reference or acknowledgement. Also note that I only provide advice and guidance via the forums - and not via private messages!

    C++23 Compiler: Microsoft VS2022 (17.6.5)

  3. #3
    Join Date
    Oct 2017
    Posts
    50

    Re: Why is my single-threaded code faster than my multithreaded code?

    So if creating a new thread is costly, then I should use a thread pool, right? Can you recommend a good thread pool library and a free profiler tool to check function execution time?

  4. #4
    Join Date
    Feb 2017
    Posts
    677

    Re: Why is my single-threaded code faster than my multithreaded code?

    Quote Originally Posted by noobofcpp View Post
    So if creating a new thread is costly, then I should use a thread pool, right? Can you recommend a good thread pool library and a free profiler tool to check function execution time?
    That may speed things up on the CPU side, but you should probably do something to that effect on the GPU side too, for example by pooling the command buffers:

    https://community.arm.com/arm-commun...and-management

    It is quite tricky to get multi-threading right. Sometimes it is not enough to divide up what you did sequentially and do it in parallel. It often requires a more substantial restructuring. The best is to design for multi-threading right from the start. Easiest is to base it on a complete example program from some trusted source.

    Visual Studio has a profiler,

    https://docs.microsoft.com/en-us/vis.../?view=vs-2019
    Last edited by wolle; November 17th, 2021 at 03:00 AM.

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  





Click Here to Expand Forum to Full Width

Featured