I have a functioning quicksort algorithm, that I am trying to make multithreaded. I know how to divide the array into parts and have multiple threads sort their own section, but how would I go about sorting each part into the final sorted array? Here's my Quicksort functions that are single threaded at the moment.

Code:
template <typename Type> void qsort_impl(Type * left, Type * right, long (*CompareFunction)(const Type &, const Type &), void (*SwapFunction)(Type &, Type &))
		{
			Type * leftbound = left;
			Type * rightbound = right;
			Type pivot = left[(right - left) / 2];

			for (;;)
			{
				while ((*CompareFunction)(*leftbound, pivot) < 0)
				{
					++leftbound;
				}

				while ((*CompareFunction)(*rightbound, pivot) > 0)
				{
					--rightbound;
				}

				while ((leftbound != rightbound) && ((*CompareFunction)(*leftbound, *rightbound) == 0))
				{
					++leftbound;
				}

				if (leftbound == rightbound)
				{
					break;
				}

				(*SwapFunction)(*leftbound, *rightbound);
			}

			if (left < --leftbound)
			{
				qsort_impl(left, leftbound, CompareFunction, SwapFunction);
			}

			if (right > ++rightbound)
			{
				qsort_impl(rightbound, right, CompareFunction, SwapFunction);
			}
		}

		template <typename Type> void Quick_Sort(Type * pArray, unsigned long dwCount, long(*CompareFunction)(const Type &, const Type &),
			(*SwapFunction)(Type &, Type &))
		{
			qsort_impl(pArray, pArray + dwCount - 1, CompareFunction, SwapFunction);
		}