question about sorting on bitstrings - Page 2
CodeGuru Home VC++ / MFC / C++ .NET / C# Visual Basic VB Forums Developer.com
Page 2 of 2 FirstFirst 12
Results 16 to 17 of 17

Thread: question about sorting on bitstrings

  1. #16
    Join Date
    May 2009
    Location
    Boston
    Posts
    254

    Re: question about sorting on bitstrings

    I have made a few modifications to make the tool more widget like. It now reads the subset size from the command line and takes the data from an input file.

    Code:
    // file sort3_02.cpp based from code by razzle
    // http://forums.codeguru.com/member.php?419897-razzle
    
    
    #include <cstdlib>
    #include <list>
    #include <vector>
    #include <string>
    #include <bitset>
    #include <iostream>
    #include <algorithm>
    #include <unistd.h>
    #include <fstream>
    
    const int strsize = 18;
    
    
    inline int next(int x) { // next K subset of an N bitset (gosper's hack (hakmem 175))
       int s = x & -x;
       int r = x + s;
       return r | (((x^r)>>2)/s);
    }
    
    inline int start(int k) { // first subset
       return (1<<k) - 1;
    }
    
    inline int limit(int n) { // first non-subset
       return 1<<n;
    }
    
    int to_set(const std::string& s) { // string to bitset
       return int(std::bitset<strsize>(s).to_ulong());
    }
    
    std::string to_string(int x) { // bitset to string
       return std::bitset<strsize>(x).to_string();
    }
    
    
    
    int main(int argc, char **argv) {
    
    // declarations for args-----------------------------------------------
    
       // command line arguments
       int cmd_arguments;
       // subset size
       int K = 0;
       std::string str_K;
       // input file read to data[]
       std::string input_file;
    
       //  Parse command line arguments
       while ((cmd_arguments = getopt (argc, argv, "s:i:h:")) != -1)
          switch (cmd_arguments)
       {
          case 's':
             str_K = optarg;
             K = atoi(str_K.c_str());
          break;
          case 'i':
             input_file = optarg;
          break;
          case 'h':
             // print help blurb
             std::cout << std::endl;
             std::cout << "use argument -i to specify input file" << std::endl;
             std::cout << "use argument -s to specify subset size" << std::endl;
             std::cout << std::endl;
             exit(0);
          break;
       }
    
       // check args
       if(input_file.length() == 0) {
          std::cerr << "  -> no input file was specified, use -i" << std::endl;
          exit(-1);
       }
       if(str_K.length() == 0) {
          std::cerr << "  -> subset size was specified, use -s" << std::endl;
          exit(-2);
       }
    
    //---------------------------------------------------------------------
    
       // create an input stream and open the input_file file
       std::ifstream input_file_ifstream;
       input_file_ifstream.open( input_file.c_str() );
    
       // check if input_file was opened
       if(!input_file_ifstream.is_open()) {
          std::cerr << "file  " << input_file << "  could not be opened" << std::endl;
          exit(-3);
       }
    
       // save data from file in vector
       std::vector<std::string> inputs_from_file;
       std::string new_input_line;
    
       while( getline(input_file_ifstream, new_input_line) ) {
          // remove '\r' EOL chars and tab characters
          new_input_line.erase (remove(new_input_line.begin(),new_input_line.end(),'\r'), new_input_line.end());
          new_input_line.erase (remove(new_input_line.begin(),new_input_line.end(),'\t'), new_input_line.end());
          // add line to vector of string
          inputs_from_file.push_back(new_input_line);
       }
    
       // close istream
       input_file_ifstream.close();
    
    
       // size of user sets (string length)
       const int N = strsize; 
    
       std::list<int> usersets; // all user sets converted to bitsets
       for (std::string s : inputs_from_file) usersets.push_back(to_set(s));
    
       std::vector<int> counts; // one counter per subset
       int b = 0; // number of K subsets of an N set (the N over K binominal)
       for (int s=start(K); s<limit(N); s=next(s)) ++b;
       std::cout << "*** binominal = " << b << std::endl;
       counts.resize(b);
    
       bool quit = false;
       do {
          for (int& c : counts) c = 0; // reset counters
    
          // count all user sets for every subset 
          for (int s=start(K), j=0; s<limit(N); s=next(s), ++j) {
             for (int set : usersets) {
                if ((set & s) == s) ++counts[j];
             }
          }
    
          // find max counter (biggest run)
          int runSet = 0;
          for (int s=start(K), j=0, m=0; s<limit(N); s=next(s), ++j) {
             if (counts[j] > m) {
                m = counts[j];
                runSet = s;
             }
          }
    
          // print all user sets belonging to this run
          if (runSet > 0) {
             // gather output sets
             std::vector<int> out;
             for (auto sp = usersets.begin(); sp != usersets.end();) {
                if ((*sp & runSet) == runSet) {
                   out.push_back(*sp);
                   sp = usersets.erase(sp); // remove printed user set
                } else ++sp;
             }
    
             //actual output
             std::sort(out.rbegin(),out.rend());  // from largest to smallest
             std::cout << to_string(runSet) << " *" << std::endl;
             for (int x : out) {
                std::cout << to_string(x) << " : " << x << std::endl;
             }
          } else quit = true; // no more runs
    
          std::cout << std::endl;
    
       } while (!quit);
    
       // print all remaining user sets
       if (usersets.size() > 0) {
          std::cout << "*** remaining sets:" << std::endl;
          for (int x : usersets) std::cout << to_string(x) << std::endl;
       }
    
    // main EOF
    }
    This was compiled an built with g++ 4.8 using,
    g++ -O2 -c -std=c++11 sort3_02.cpp
    g++ -std=c++11 -o sort3_02.exe sort3_02.o

    The code is quite fast (instantaneous) for all of the examples I have tried, even up to subset size of 9/18.

    I am going to get this learning on the data set I have now and then work on making the subset size look up recursive, meaning the for 8 on bits, the sorting will start with subset size of 7. All strings that are not part of a run of at least 2 will be re-analyzed with a subset size of 6, then 5 etc. This will repeat until all strings have been assigned to a run, or there are only two unassigned strings remaining.

    LMHmedchem

  2. #17
    Join Date
    Jul 2013
    Posts
    256

    Re: question about sorting on bitstrings

    Quote Originally Posted by LMHmedchem View Post
    The code is quite fast (instantaneous) for all of the examples I have tried, even up to subset size of 9/18.
    Just for the sake of it I've improved on the algorithm somewhat. It's based on the observation that probably not all subsets are actually used. So I remove the subsets that are not present in any 18-bit string and store the rest in a vector called subsets. All subsets are scanned just once at the beginning (using the full gosper loop). After that the (hopefully fewer) subsets in the subsets vector are used.

    And indeed, with the original test strings the reduction is from 816 to 123 subsets. That should translate to almost 10 times faster code. The optimization will depend on the nature of the 18-bit strings so nothing is guaranteed but at least it will not slow down the program.

    Also the memory requirements have been reduced. The biggest memory consumer was the counts vector of size 816. After the optimization there are two vectors namely counts and subsets of the same size 123. It's a decrease from 812 before to 246 after the optimization. But if unfortunately no subset reduction took place the memory requirements will instead double, increasing from 816 to 1632. That's the worst case cost of the optimization.

    The optimization is limited to one section of my original code. If you want to use it just replace the old section with the new section below. Good luck with your research.

    Changed code section:

    Code:
    // ---
    	std::vector<int> counts; // one counter per subset
    	std::vector<int> subsets; // sequence of actually used subsets
    	int b = 0; // number of K subsets of an N set (the N over K binominal)
    	for (int s=start(K); s<limit(N); s=next(s)) { // full subset (gosper) loop
    		for (int set : usersets) {
    			if ((set & s) == s) {
    				subsets.push_back(s);  // subset is in use
    				break;
    			}
    		}
    		++b;
    	}
    	counts.resize(subsets.size());
    
    	std::cout << "*** all subsets (the binominal) = " << b << std::endl;
    	std::cout << "*** subsets in use = " << subsets.size() << std::endl;
    
    	bool quit = false;
    	do {
    		for (int& c : counts) c = 0; // reset counters
    
    			// count all user sets for every subset 
    		for (int j=0, J=subsets.size(); j<J; ++j) { // reduced subset loop
    			int s = subsets[j];
    			int& c = counts[j];
    			for (int set : usersets) {
    				if ((set & s) == s) ++c; // increment subset counter
    			}
    		}
    
    			// find max counter (biggest run)
    		int runSet = 0;
    		for (int j=0, J=subsets.size(), m=0; j<J; ++j) {
    			if (counts[j] > m) {
    				m = counts[j];
    				runSet = subsets[j];
    			}
    		}
    // ---
    Last edited by razzle; April 5th, 2014 at 11:58 PM.

Page 2 of 2 FirstFirst 12

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  


Azure Activities Information Page

Windows Mobile Development Center


Click Here to Expand Forum to Full Width

This is a CodeGuru survey question.


Featured


HTML5 Development Center