I just tried to implement multithreaded rendering with Vulkan, and the result is disappointing: the multithreaded code is 4x slower than the single-threaded version. I tried multithreading in my CPU ray tracer and it runs 10x faster than the single-threaded version. Why is my Vulkan multithreaded rendering slower?

Here's the code :
Code:
/// Records the draws in [firstIndex, lastIndex) of *drawInfo into cmdBuffer.CommandBuffer.
///
/// The command buffer is begun with VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT and
/// inheritance info taken from pRenderPass, so it is recorded as a secondary command buffer
/// that continues that render pass (the subpass index is left at its zero-initialized value).
///
/// @param device            Device that owns cmdBuffer.CommandPool.
/// @param cmdBuffer         Pool + command buffer pair to record into; the pool is reset first.
/// @param pRenderPass       Supplies the render pass and framebuffer handles to inherit.
/// @param drawInfo          Full draw list; only the slice [firstIndex, lastIndex) is recorded.
/// @param firstIndex        First draw (inclusive) recorded by this call.
/// @param lastIndex         One past the last draw recorded by this call (clamped to the list size).
/// @param currentFrameIndex Index used to select the framebuffer from pRenderPass.
/// @param viewports         Viewports set at the start of the command buffer.
/// @param scissors          Scissor rectangles set at the start of the command buffer.
///
/// Failures of the Vulkan reset/begin/end calls are reported with printf; on a failed reset or
/// begin nothing is recorded. NOTE(review): the void return gives the caller no way to learn
/// about such a failure — confirm how callers should react.
void CommandBufferVulkan::DrawMultiThread(VkDevice device, PerThreadCommandBuffer cmdBuffer, RenderPassVulkan* pRenderPass, const std::vector<DrawIndexedMultiThreadInfo>* drawInfo, uint32_t firstIndex,
		uint32_t lastIndex, uint32_t currentFrameIndex, std::vector<VkViewport>* viewports, std::vector<VkRect2D>* scissors)
	{
		if (vkResetCommandPool(device, cmdBuffer.CommandPool, 0) != VK_SUCCESS)
		{
			printf("vkResetCommandPool failed!\n");
			return;
		}

		VkCommandBufferInheritanceInfo inheriteInfo = {};
		inheriteInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO;
		inheriteInfo.renderPass = pRenderPass->GetRenderPassHandle();
		inheriteInfo.framebuffer = pRenderPass->GetFrambufferHandle(currentFrameIndex);

		VkCommandBufferBeginInfo cmdBufferBeginInfo = {};
		cmdBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
		cmdBufferBeginInfo.pInheritanceInfo = &inheriteInfo;
		cmdBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;

		if (vkBeginCommandBuffer(cmdBuffer.CommandBuffer, &cmdBufferBeginInfo) != VK_SUCCESS)
		{
			// Recording into a buffer that never entered the recording state is invalid, so stop here.
			printf("vkBeginCommandBuffer failed!\n");
			return;
		}

		vkCmdSetViewport(cmdBuffer.CommandBuffer, 0, (uint32_t)viewports->size(), viewports->data());
		vkCmdSetScissor(cmdBuffer.CommandBuffer, 0, (uint32_t)scissors->size(), scissors->data());

		// Clamp the slice to the draw list so an empty or out-of-range slice never indexes past the end.
		const uint32_t drawTotal = (drawInfo != nullptr) ? (uint32_t)drawInfo->size() : 0;
		const uint32_t endIndex = (lastIndex < drawTotal) ? lastIndex : drawTotal;

		if (firstIndex < endIndex)
		{
			// Scratch list of raw handles, reused for the global bind and for every draw so the
			// recording loop does not allocate once its capacity has grown.
			std::vector<VkDescriptorSet> sets;

			// The global descriptor sets are taken from the first draw of the slice and bound once.
			const DrawIndexedMultiThreadInfo& firstDraw = (*drawInfo)[firstIndex];

			for (uint32_t i = 0; i < firstDraw.PGlobalDescriptorSetBindInfo->PDescriptorSets.size(); ++i)
				sets.push_back(firstDraw.PGlobalDescriptorSetBindInfo->PDescriptorSets[i]->GetDescriptorSetHandle());

			// vkCmdBindDescriptorSets requires a non-zero set count, so skip the call when there is nothing to bind.
			if (!sets.empty())
			{
				vkCmdBindDescriptorSets(cmdBuffer.CommandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, firstDraw.PGlobalDescriptorSetBindInfo->PPipelineLayout->GetPipelineLayoutHandle(),
					firstDraw.PGlobalDescriptorSetBindInfo->FirstSet, (uint32_t)sets.size(), sets.data(), 0, nullptr);
			}

			// Last pipeline bound in this command buffer; used to avoid redundant rebinds.
			PipelineVulkan* pBoundPipeline = nullptr;

			for (uint32_t i = firstIndex; i < endIndex; ++i)
			{
				const DrawIndexedMultiThreadInfo& draw = (*drawInfo)[i];

				if (draw.PPipelineVulkan != pBoundPipeline)
				{
					vkCmdBindPipeline(cmdBuffer.CommandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, draw.PPipelineVulkan->GetPipelineHandle());
					pBoundPipeline = draw.PPipelineVulkan;
				}

				VkPipelineLayout layout = draw.DescriptorBindInfo.PPipelineLayout->GetPipelineLayoutHandle();

				// clear() keeps the capacity, so this refill normally does not touch the heap.
				sets.clear();

				for (uint32_t j = 0; j < draw.DescriptorBindInfo.PDescriptorSets.size(); ++j)
					sets.push_back(draw.DescriptorBindInfo.PDescriptorSets[j]->GetDescriptorSetHandle());

				if (!sets.empty())
				{
					vkCmdBindDescriptorSets(cmdBuffer.CommandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, layout,
						draw.DescriptorBindInfo.FirstSet, (uint32_t)sets.size(), sets.data(), 0, nullptr);
				}

				vkCmdPushConstants(cmdBuffer.CommandBuffer, layout, draw.PushConstantStage, 0, draw.PushConstantSize,
					draw.PPushConstantData);

				VkBuffer vertexBuffer = draw.PVertexBuffer->GetBufferHandle();
				VkBuffer indexBuffer = draw.PIndexBuffer->GetBufferHandle();
				VkDeviceSize offset = 0;

				vkCmdBindVertexBuffers(cmdBuffer.CommandBuffer, 0, 1, &vertexBuffer, &offset);
				vkCmdBindIndexBuffer(cmdBuffer.CommandBuffer, indexBuffer, 0, VK_INDEX_TYPE_UINT32);
				vkCmdDrawIndexed(cmdBuffer.CommandBuffer, draw.IndexCount, 1, 0, 0, 0);
			}
		}

		// Always end the buffer, even for an empty slice, so it is left in a completed state.
		if (vkEndCommandBuffer(cmdBuffer.CommandBuffer) != VK_SUCCESS)
			printf("vkEndCommandBuffer failed!\n");
	}

	void CommandBufferVulkan::DrawIndexedMultiThread(const std::vector<DrawIndexedMultiThreadInfo>& info)
	{
		if (info.size() < std::thread::hardware_concurrency())
		{
			printf("Draw count lower than num of thread!\n");
			return;
		}

		uint32_t threadCount = std::thread::hardware_concurrency();
		uint32_t drawCountPerThread = (uint32_t)(info.size() / threadCount);
		uint32_t drawCountPerThreadMod = info.size() % threadCount;
		uint32_t firstIndex = 0;
		uint32_t lastIndex = drawCountPerThread;

		VkDevice device = mpDeviceVulkan->GetDeviceHandle();
        
		std::vector<std::thread> threads(threadCount);

		for (uint32_t i = 0; i < threadCount; ++i)
		{
			if (i == (threadCount - 1))
			{
				lastIndex += drawCountPerThreadMod;
				threads[i] =std::thread(DrawMultiThread, device, mPerThreadCommandBuffers[mCurrentFrame][i], mpCurrentRenderPass, &info, firstIndex, lastIndex, mCurrentFrame, &mCurrentViewports, &mCurrentScissors);
			}
			else
			{
				threads[i] = std::thread(DrawMultiThread, device, mPerThreadCommandBuffers[mCurrentFrame][i], mpCurrentRenderPass, &info, firstIndex, lastIndex, mCurrentFrame, &mCurrentViewports, &mCurrentScissors);
				
				firstIndex += drawCountPerThread;
				lastIndex += drawCountPerThread;
			}
		}

		for (uint32_t i = 0; i < threadCount; ++i)
			threads[i].join();

		std::vector<VkCommandBuffer> commandBuffers;

		for (uint32_t i = 0; i < mPerThreadCommandBuffers[mCurrentFrame].size(); ++i)
			commandBuffers.push_back(mPerThreadCommandBuffers[mCurrentFrame][i].CommandBuffer);

		vkCmdExecuteCommands(mCurrentVkCommandBuffer, (uint32_t)commandBuffers.size(), commandBuffers.data());
	}
Here are the results:
Multithreading
Name:  mt.jpg
Views: 946
Size:  77.7 KB

Singlethreading
Name:  st.jpg
Views: 681
Size:  50.6 KB