question about sorting on bitstrings

**LMHmedchem** · April 3rd, 2014, 04:20 PM

I have made a few modifications to make the tool more widget like. It now reads the subset size from the command line and takes the data from an input file.

Code:

// file sort3_02.cpp based from code by razzle
// http://forums.codeguru.com/member.php?419897-razzle


#include <cstdlib>
#include <list>
#include <vector>
#include <string>
#include <bitset>
#include <iostream>
#include <algorithm>
#include <unistd.h>
#include <fstream>

const int strsize = 18;


inline int next(int x) { // next K subset of an N bitset (gosper's hack (hakmem 175))
   int s = x & -x;
   int r = x + s;
   return r | (((x^r)>>2)/s);
}

inline int start(int k) { // first subset
   return (1<<k) - 1;
}

inline int limit(int n) { // first non-subset
   return 1<<n;
}

int to_set(const std::string& s) { // string to bitset
   return int(std::bitset<strsize>(s).to_ulong());
}

std::string to_string(int x) { // bitset to string
   return std::bitset<strsize>(x).to_string();
}



int main(int argc, char **argv) {

// declarations for args-----------------------------------------------

   // command line arguments
   int cmd_arguments;
   // subset size
   int K = 0;
   std::string str_K;
   // input file read to data[]
   std::string input_file;

   //  Parse command line arguments
   while ((cmd_arguments = getopt (argc, argv, "s:i:h:")) != -1)
      switch (cmd_arguments)
   {
      case 's':
         str_K = optarg;
         K = atoi(str_K.c_str());
      break;
      case 'i':
         input_file = optarg;
      break;
      case 'h':
         // print help blurb
         std::cout << std::endl;
         std::cout << "use argument -i to specify input file" << std::endl;
         std::cout << "use argument -s to specify subset size" << std::endl;
         std::cout << std::endl;
         exit(0);
      break;
   }

   // check args
   if(input_file.length() == 0) {
      std::cerr << "  -> no input file was specified, use -i" << std::endl;
      exit(-1);
   }
   if(str_K.length() == 0) {
      std::cerr << "  -> subset size was specified, use -s" << std::endl;
      exit(-2);
   }

//---------------------------------------------------------------------

   // create an input stream and open the input_file file
   std::ifstream input_file_ifstream;
   input_file_ifstream.open( input_file.c_str() );

   // check if input_file was opened
   if(!input_file_ifstream.is_open()) {
      std::cerr << "file  " << input_file << "  could not be opened" << std::endl;
      exit(-3);
   }

   // save data from file in vector
   std::vector<std::string> inputs_from_file;
   std::string new_input_line;

   while( getline(input_file_ifstream, new_input_line) ) {
      // remove '\r' EOL chars and tab characters
      new_input_line.erase (remove(new_input_line.begin(),new_input_line.end(),'\r'), new_input_line.end());
      new_input_line.erase (remove(new_input_line.begin(),new_input_line.end(),'\t'), new_input_line.end());
      // add line to vector of string
      inputs_from_file.push_back(new_input_line);
   }

   // close istream
   input_file_ifstream.close();


   // size of user sets (string length)
   const int N = strsize; 

   std::list<int> usersets; // all user sets converted to bitsets
   for (std::string s : inputs_from_file) usersets.push_back(to_set(s));

   std::vector<int> counts; // one counter per subset
   int b = 0; // number of K subsets of an N set (the N over K binominal)
   for (int s=start(K); s<limit(N); s=next(s)) ++b;
   std::cout << "*** binominal = " << b << std::endl;
   counts.resize(b);

   bool quit = false;
   do {
      for (int& c : counts) c = 0; // reset counters

      // count all user sets for every subset 
      for (int s=start(K), j=0; s<limit(N); s=next(s), ++j) {
         for (int set : usersets) {
            if ((set & s) == s) ++counts[j];
         }
      }

      // find max counter (biggest run)
      int runSet = 0;
      for (int s=start(K), j=0, m=0; s<limit(N); s=next(s), ++j) {
         if (counts[j] > m) {
            m = counts[j];
            runSet = s;
         }
      }

      // print all user sets belonging to this run
      if (runSet > 0) {
         // gather output sets
         std::vector<int> out;
         for (auto sp = usersets.begin(); sp != usersets.end();) {
            if ((*sp & runSet) == runSet) {
               out.push_back(*sp);
               sp = usersets.erase(sp); // remove printed user set
            } else ++sp;
         }

         //actual output
         std::sort(out.rbegin(),out.rend());  // from largest to smallest
         std::cout << to_string(runSet) << " *" << std::endl;
         for (int x : out) {
            std::cout << to_string(x) << " : " << x << std::endl;
         }
      } else quit = true; // no more runs

      std::cout << std::endl;

   } while (!quit);

   // print all remaining user sets
   if (usersets.size() > 0) {
      std::cout << "*** remaining sets:" << std::endl;
      for (int x : usersets) std::cout << to_string(x) << std::endl;
   }

// main EOF
}

This was compiled an built with g++ 4.8 using,
g++ -O2 -c -std=c++11 sort3_02.cpp
g++ -std=c++11 -o sort3_02.exe sort3_02.o

The code is quite fast (instantaneous) for all of the examples I have tried, even up to subset size of 9/18.

I am going to get this learning on the data set I have now and then work on making the subset size look up recursive, meaning the for 8 on bits, the sorting will start with subset size of 7. All strings that are not part of a run of at least 2 will be re-analyzed with a subset size of 6, then 5 etc. This will repeat until all strings have been assigned to a run, or there are only two unassigned strings remaining.

LMHmedchem

**razzle** · April 4th, 2014, 02:09 AM

Originally Posted by LMHmedchem

The code is quite fast (instantaneous) for all of the examples I have tried, even up to subset size of 9/18.

Just for the sake of it I've improved on the algorithm somewhat. It's based on the observation that probably not all subsets are actually used. So I remove the subsets that are not present in any 18-bit string and store the rest in a vector called subsets. All subsets are scanned just once at the beginning (using the full gosper loop). After that the (hopefully fewer) subsets in the subsets vector are used.

And indeed, with the original test strings the reduction is from 816 to 123 subsets. That should translate to almost 10 times faster code. The optimization will depend on the nature of the 18-bit strings so nothing is guaranteed but at least it will not slow down the program.

Also the memory requirements have been reduced. The biggest memory consumer was the counts vector of size 816. After the optimization there are two vectors namely counts and subsets of the same size 123. It's a decrease from 812 before to 246 after the optimization. But if unfortunately no subset reduction took place the memory requirements will instead double, increasing from 816 to 1632. That's the worst case cost of the optimization.

The optimization is limited to one section of my original code. If you want to use it just replace the old section with the new section below. Good luck with your research.

Changed code section:

Code:

// ---
	std::vector<int> counts; // one counter per subset
	std::vector<int> subsets; // sequence of actually used subsets
	int b = 0; // number of K subsets of an N set (the N over K binominal)
	for (int s=start(K); s<limit(N); s=next(s)) { // full subset (gosper) loop
		for (int set : usersets) {
			if ((set & s) == s) {
				subsets.push_back(s);  // subset is in use
				break;
			}
		}
		++b;
	}
	counts.resize(subsets.size());

	std::cout << "*** all subsets (the binominal) = " << b << std::endl;
	std::cout << "*** subsets in use = " << subsets.size() << std::endl;

	bool quit = false;
	do {
		for (int& c : counts) c = 0; // reset counters

			// count all user sets for every subset 
		for (int j=0, J=subsets.size(); j<J; ++j) { // reduced subset loop
			int s = subsets[j];
			int& c = counts[j];
			for (int set : usersets) {
				if ((set & s) == s) ++c; // increment subset counter
			}
		}

			// find max counter (biggest run)
		int runSet = 0;
		for (int j=0, J=subsets.size(), m=0; j<J; ++j) {
			if (counts[j] > m) {
				m = counts[j];
				runSet = subsets[j];
			}
		}
// ---

Thread: question about sorting on bitstrings

Thread Tools

Display

Re: question about sorting on bitstrings

Re: question about sorting on bitstrings

Posting Permissions