[Solved] std stack performance issues [closed]

Question

The many comments (and even answers) focus on the risks in your implementation. Yet the question stands.

As directly demonstrated below rectifying the perceived code shortcomings would not change anything significant about the performance.

Here is the OP’s code modified to be (A) safe, and (B) supporting the same operations as std::stack, and (C) reserving buffer space also for the std::stack, in order to clarify things for those who mistakenly believe that this stuff matters for the performance:

#define _SECURE_SCL 0
#define _SCL_SECURE_NO_WARNINGS

#include <algorithm>        // std::swap
#include <iostream>
#include <vector>
#include <stack>
#include <stddef.h>         // ptrdiff_t
#include <type_traits>      // std::is_pod
using namespace std;

#undef UNICODE
#define UNICODE
#include <Windows.h>

typedef ptrdiff_t   Size;
typedef Size        Index;

template< class Type, class Container >
void reserve( Size const newBufSize, std::stack< Type, Container >& st )
{
    struct Access: std::stack< Type, Container >
    {
        static Container& container( std::stack< Type, Container >& st )
        {
            return st.*&Access::c;
        }
    };

    Access::container( st ).reserve( newBufSize );
}

class HighResolutionTimer
{
public:
    HighResolutionTimer();
    double GetFrequency() const;
    void Start() ;
    double Stop();
    double GetTime() const;

private:
    LARGE_INTEGER start;
    LARGE_INTEGER stop;
    double frequency;
};

HighResolutionTimer::HighResolutionTimer()
{
    frequency = GetFrequency();
}

double HighResolutionTimer::GetFrequency() const
{
    LARGE_INTEGER proc_freq;
    if (!::QueryPerformanceFrequency(&proc_freq))
        return -1;
    return static_cast< double >( proc_freq.QuadPart );
}

void HighResolutionTimer::Start()
{
    DWORD_PTR oldmask = ::SetThreadAffinityMask(::GetCurrentThread(), 0);
    ::QueryPerformanceCounter(&start);
    ::SetThreadAffinityMask(::GetCurrentThread(), oldmask);
}

double HighResolutionTimer::Stop()
{
    DWORD_PTR oldmask = ::SetThreadAffinityMask(::GetCurrentThread(), 0);
    ::QueryPerformanceCounter(&stop);
    ::SetThreadAffinityMask(::GetCurrentThread(), oldmask);
    return ((stop.QuadPart - start.QuadPart) / frequency);
} 

double HighResolutionTimer::GetTime() const
{
    LARGE_INTEGER time;
    ::QueryPerformanceCounter(&time);
    return time.QuadPart / frequency;
}

template< class Type, bool elemTypeIsPOD = !!std::is_pod< Type >::value >
class FastStack;

template< class Type >
class FastStack< Type, true >
{
private:
    Type*   st_;
    Index   lastIndex_;
    Size    capacity_;

public:
    Size const size() const { return lastIndex_ + 1; }
    Size const capacity() const { return capacity_; }

    void reserve( Size const newCapacity )
    {
        if( newCapacity > capacity_ )
        {
            FastStack< Type >( *this, newCapacity ).swapWith( *this );
        }
    }

    void push( Type const& x )
    {
        if( size() == capacity() )
        {
            reserve( 2*capacity() );
        }
        st_[++lastIndex_] = x;
    }

    void pop()
    {
        --lastIndex_;
    }

    Type top() const
    {
        return st_[lastIndex_];
    }

    void swapWith( FastStack& other ) throw()
    {
        using std::swap;
        swap( st_, other.st_ );
        swap( lastIndex_, other.lastIndex_ );
        swap( capacity_, other.capacity_ );
    }

    void operator=( FastStack other )
    {
        other.swapWith( *this );
    }

    ~FastStack()
    {
        delete[] st_;
    }

    FastStack( Size const aCapacity = 0 )
        : st_( new Type[aCapacity] )
        , capacity_( aCapacity )
    {
        lastIndex_ = -1;
    }

    FastStack( FastStack const& other, int const newBufSize = -1 )
    {
        capacity_ = (newBufSize < other.size()? other.size(): newBufSize);
        st_ = new Type[capacity_];
        lastIndex_ = other.lastIndex_;
        copy( other.st_, other.st_ + other.size(), st_ );   // Can't throw for POD.
    }
};

template< class Type >
void reserve( Size const newCapacity, FastStack< Type >& st )
{
    st.reserve( newCapacity );
}

template< class StackType >
void test( char const* const description )
{
    for( int it = 0; it < 4; ++it )
    {
        StackType st;
        reserve( 200, st );

        // after this two loops, st's capacity will be 141 so there will be no more reallocating
        for( int i = 0; i < 100; ++i ) { st.push( i ); }
        for( int i = 0; i < 100; ++i ) { st.pop(); }

        // when you uncomment this line, std::stack performance will magically rise about 18%
        // std::vector<int> magicVector(10);

        HighResolutionTimer timer;
        timer.Start();

        for( Index i = 0; i < 1000000000; ++i )
        {
            st.push( i );
            (void) st.top();
            if( i % 100 == 0 && i != 0 )
            {
                for( int j = 0; j < 100; ++j ) { st.pop(); }
            }
        }

        double const totalTime = timer.Stop();
        wcout << description << ": "  << totalTime << endl;
    }
}

int main()
{
    typedef stack< Index, vector< Index > > SStack;
    typedef FastStack< Index >              FStack;

    test< SStack >( "std::stack" );
    test< FStack >( "FastStack" );

    cout << "Done";
}

Results on this slow-as-molasses Samsung RC530 laptop:

[D:\dev\test\so\12704314]
> a
std::stack: 3.21319
std::stack: 3.16456
std::stack: 3.23298
std::stack: 3.20854
FastStack: 1.97636
FastStack: 1.97958
FastStack: 2.12977
FastStack: 2.13507
Done
[D:\dev\test\so\12704314]
> _

And similarly for Visual C++.

Now let’s look at a typical implementation of std::vector::push_back, which is called by std::stack<T, std::vector<T>>::push (in passing, I know of only 3 programmers who have ever used this indentation style, namely PJP, Petzold and myself; I now, since 1998 or thereabouts, think it’s horrible!):

void push_back(const value_type& _Val)
    {   // insert element at end
    if (_Inside(_STD addressof(_Val)))
        {   // push back an element
        size_type _Idx = _STD addressof(_Val) - this->_Myfirst;
        if (this->_Mylast == this->_Myend)
            _Reserve(1);
        _Orphan_range(this->_Mylast, this->_Mylast);
        this->_Getal().construct(this->_Mylast,
            this->_Myfirst[_Idx]);
        ++this->_Mylast;
        }
    else
        {   // push back a non-element
        if (this->_Mylast == this->_Myend)
            _Reserve(1);
        _Orphan_range(this->_Mylast, this->_Mylast);
        this->_Getal().construct(this->_Mylast,
            _Val);
        ++this->_Mylast;
        }
    }

I suspect that the measured inefficiency lies at least partly in all the stuff going on there, and perhaps it’s also a matter of automatically generated safety checks.

For a debug build the std::stack performance is so extremely ungood that I gave up waiting for any result.

EDIT: following Xeo’s comment below I updated push to check for “self-push” in the case of buffer reallocation, by factoring that out as a separate function:

void push( Type const& x )
{
    if( size() == capacity() )
    {
        reserveAndPush( x );
    }
    st_[++lastIndex_] = x;
}

Mysteriously, although reserveAndPush is never called in this testing, it affects the performance – due to code size not fitting cache?

[D:\dev\test\so\12704314]
> a
std::stack: 3.21623
std::stack: 3.30501
std::stack: 3.24337
std::stack: 3.27711
FastStack: 2.52791
FastStack: 2.44621
FastStack: 2.44759
FastStack: 2.47287
Done
[D:\dev\test\so\12704314]
> _

EDIT 2: DeadMG showed that the code must be buggy. I believe the problem was a missing return, plus the expression computing new size (twice zero is still zero). He also pointed out that I forgot to show reserveAndPush. Should be:

void reserveAndPush( Type const& x )
{
    Type const xVal = x;
    reserve( capacity_ == 0? 1 : 2*capacity_ );
    push( xVal );
}

void push( Type const& x )
{
    if( size() == capacity() )
    {
        return reserveAndPush( x );    // <-- The crucial "return".
    }
    st_[++lastIndex_] = x;
}

Accepted Answer