Here is a demo with a more efficient Qsort() function that I re-write in plain C. As shown by this example code. It easily beats the most efficient std::sort() based code.
Code:
#include <vector>
#include <algorithm>
#include <functional>
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <string>

#define CUTOFF 8

using namespace std;

// This is Anthony's improved version of qsort
void __cdecl Qsort (
    void *base,
    unsigned num,
    unsigned width,
    int (__cdecl *comp)(const void *, const void *)
    );

class SomeClass
{
   public:
     std::string fld1;
     std::string fld2;
     std::string fld3;
     std::string fld4;
};

struct SortFld
{
    bool operator() (const SomeClass& first, const SomeClass& second)
    {
        return strcmp(first.fld1.c_str(), second.fld1.c_str())<0?true:false;
    }
};


typedef vector<SomeClass> intvec;

void randomise_vector(intvec& v)
{
    srand(time(0));
    char szBuf[15];
    for (intvec::iterator i = v.begin(); i != v.end(); ++i)
    {
        sprintf(szBuf, "%d", rand());
        (*i).fld1 = szBuf;
    }
}

inline bool stllessfunc(const SomeClass& lhs, const SomeClass& rhs)
{
        return strcmp(lhs.fld1.c_str(), rhs.fld1.c_str())<0?true:false;
}

inline int qscompare(const void* lhs, const void* rhs)
{
    return strcmp(((SomeClass *)lhs)->fld1.c_str(), ((SomeClass*)rhs)->fld1.c_str());
}


int main()
{
    int     i, total = 999999;
    intvec v(total+1);

    randomise_vector(v);

    cout << "std::sort using functor: ";
    clock_t start = clock();
    sort(v.begin(), v.end(), SortFld());
    clock_t stop = clock();
    cout << stop - start << " ticks" << endl;

    randomise_vector(v);

    cout << "std::sort using function: ";
    start = clock();
    sort(v.begin(), v.end(), stllessfunc);
    stop = clock();
    cout << stop - start << " ticks" << endl;

    randomise_vector(v);
    cout << "qsort: ";
    start = clock();
    Qsort(static_cast<void*>(&v[0]), 1000000, sizeof(SomeClass), qscompare);
    stop = clock();
    cout << stop - start << " ticks" << endl;

    for (i=0; i<999998; i++)
    {
        if (strcmp(v[i].fld1.c_str(), v[i+1].fld1.c_str()) > 0)
        {
            break;
        }
    }
    if (i < 999998)
    {
        cout << "There is an error in Qsort" << endl;
    }
    else
    {
        cout << "There is NO error in Qsort" << endl;
    }

    cout << "Test done" << endl;

    return 0;
}


// This is Anthony's improved version of qsort  -A.Mai 7/18/2002
void __cdecl Qsort (
    void *base,
    unsigned num,
    unsigned width,
    int (__cdecl *comp)(const void *, const void *)
    )
{
    char *lo, *hi;
    char *mid;
    char *loguy, *higuy;
    unsigned size;
    char *lostk[30], *histk[30];
    int stkptr;

    if (num < 2 || width == 0)
    {
        return;
    }

    stkptr = 0;

    lo = (char*)base;
    hi = (char *)base + width * (num-1);

recurse:

    size = (hi - lo) / width + 1;

    if (size > CUTOFF)
    {
        mid = lo + (size / 2) * width;

		{
			register int w2 = width;

            if (w2 < 4)
            {
                do
			    {
                    char    tmp;

                    w2 --;
				    tmp = mid[w2];
				    mid[w2] = lo[w2];
				    lo[w2] = tmp;
			    } while (w2 > 0);
            }
            else
            {
				int tmp;
                
                tmp = *(int*)(&mid[w2-4]);
				*(int*)(&mid[w2-4]) = *(int*)(&lo[w2-4]);
				*(int*)(&lo[w2-4]) = tmp;
                w2--;
                w2 &= -4;
			    while (w2 >= 4)
			    {
				    w2 -= 4;
				    tmp = *(int*)(&mid[w2]);
				    *(int*)(&mid[w2]) = *(int*)(&lo[w2]);
				    *(int*)(&lo[w2]) = tmp;
			    }
            }
		}

        loguy = lo;
        higuy = hi + width;

        for (;;)
		{
            do  {
                loguy += width;
            } while (loguy <= hi && comp(loguy, lo) <= 0);

            do  {
                higuy -= width;
            } while (higuy > lo && comp(higuy, lo) >= 0);

            if (higuy < loguy)
			{
                break;
			}

			{
				register int w2 = width;

                if (w2 < 4)
                {
                    do
				    {
                        char    tmp;

                        w2 --;
					    tmp = loguy[w2];
					    loguy[w2] = higuy[w2];
					    higuy[w2] = tmp;
				    } while (w2 > 0);
                }
                else
                {
					int tmp;
                    
                    tmp = *(int*)(&loguy[w2-4]);
					*(int*)(&loguy[w2-4]) = *(int*)(&higuy[w2-4]);
					*(int*)(&higuy[w2-4]) = tmp;
                    w2--;
                    w2 &= -4;
				    while (w2 >= 4)
				    {
					    w2 -= 4;
					    tmp = *(int*)(&loguy[w2]);
					    *(int*)(&loguy[w2]) = *(int*)(&higuy[w2]);
					    *(int*)(&higuy[w2]) = tmp;
				    }
                }
			}
        }

		{
			register int w2 = width;

            if (w2 < 4)
            {
                do
			    {
                    char    tmp;
                    w2 --;
				    tmp = lo[w2];
				    lo[w2] = higuy[w2];
				    higuy[w2] = tmp;
			    } while (w2 > 0);
            }
            else
            {
                int tmp;

				tmp = *(int*)(&lo[w2-4]);
				*(int*)(&lo[w2-4]) = *(int*)(&higuy[w2-4]);
				*(int*)(&higuy[w2-4]) = tmp;
                w2--;
                w2 &= -4;
			    while (w2 >= 4)
			    {
				    w2 -= 4;
				    int tmp = *(int*)(&lo[w2]);
				    *(int*)(&lo[w2]) = *(int*)(&higuy[w2]);
				    *(int*)(&higuy[w2]) = tmp;
			    }
            }
		}

        if ( higuy - 1 - lo >= hi - loguy )
		{
            if (lo + width < higuy)
			{
                lostk[stkptr] = lo;
                histk[stkptr] = higuy - width;
                ++stkptr;
            }

            if (loguy < hi)
			{
                lo = loguy;
                goto recurse;
            }
        }
        else
		{
            if (loguy < hi)
			{
                lostk[stkptr] = loguy;
                histk[stkptr] = hi;
                ++stkptr;
            }

            if (lo + width < higuy)
			{
                hi = higuy - width;
                goto recurse;
            }
        }
    }
    else
    {
		char *p, *max;

		while (hi > lo)
        {
			max = lo;
			for (p = lo+width; p <= hi; p += width)
            {
				if (comp(p, max) > 0)
                {
					max = p;
				}
			}

			if ( max != hi )
			{
				register int w2 = width;

				if (w2 < 4)
				{
					do {
                        char tmp;

                        w2 --;
						tmp = max[w2];
						max[w2] = hi[w2];
						hi[w2] = tmp;
					} while (w2 > 0);
				}
				else
				{
					int tmp;
                    
					tmp = *(int*)(&max[w2-4]);
					*(int*)(&max[w2-4]) = *(int*)(&hi[w2-4]);
					*(int*)(&hi[w2-4]) = tmp;
                    w2--;
                    w2 &= -4;
				    while (w2 >= 4)
				    {
					    w2 -= 4;
					    tmp = *(int*)(&max[w2]);
					    *(int*)(&max[w2]) = *(int*)(&hi[w2]);
					    *(int*)(&hi[w2]) = tmp;
				    }
				}
			}

			hi -= width;
		}
    }


    --stkptr;
    if (stkptr >= 0)
	{
        lo = lostk[stkptr];
        hi = histk[stkptr];
        goto recurse;
    }

	return;
}