question about sorting on bitstrings

I have made a few modifications to make the tool more widget like. It now reads the subset size from the command line and takes the data from an input file.

Code:

// file sort3_02.cpp based from code by razzle // http://forums.codeguru.com/member.php?419897-razzle #include <cstdlib> #include <list> #include <vector> #include <string> #include <bitset> #include <iostream> #include <algorithm> #include <unistd.h> #include <fstream> const int strsize = 18; inline int next(int x) { // next K subset of an N bitset (gosper's hack (hakmem 175)) int s = x & -x; int r = x + s; return r | (((x^r)>>2)/s); } inline int start(int k) { // first subset return (1<<k) - 1; } inline int limit(int n) { // first non-subset return 1<<n; } int to_set(const std::string& s) { // string to bitset return int(std::bitset<strsize>(s).to_ulong()); } std::string to_string(int x) { // bitset to string return std::bitset<strsize>(x).to_string(); } int main(int argc, char **argv) { // declarations for args----------------------------------------------- // command line arguments int cmd_arguments; // subset size int K = 0; std::string str_K; // input file read to data[] std::string input_file; // Parse command line arguments while ((cmd_arguments = getopt (argc, argv, "s:i:h:")) != -1) switch (cmd_arguments) { case 's': str_K = optarg; K = atoi(str_K.c_str()); break; case 'i': input_file = optarg; break; case 'h': // print help blurb std::cout << std::endl; std::cout << "use argument -i to specify input file" << std::endl; std::cout << "use argument -s to specify subset size" << std::endl; std::cout << std::endl; exit(0); break; } // check args if(input_file.length() == 0) { std::cerr << " -> no input file was specified, use -i" << std::endl; exit(-1); } if(str_K.length() == 0) { std::cerr << " -> subset size was specified, use -s" << std::endl; exit(-2); } //--------------------------------------------------------------------- // create an input stream and open the input_file file std::ifstream input_file_ifstream; input_file_ifstream.open( input_file.c_str() ); // check if input_file was opened if(!input_file_ifstream.is_open()) { std::cerr << "file " << input_file << " could not be opened" << std::endl; exit(-3); } // save data from file in vector std::vector<std::string> inputs_from_file; std::string new_input_line; while( getline(input_file_ifstream, new_input_line) ) { // remove '\r' EOL chars and tab characters new_input_line.erase (remove(new_input_line.begin(),new_input_line.end(),'\r'), new_input_line.end()); new_input_line.erase (remove(new_input_line.begin(),new_input_line.end(),'\t'), new_input_line.end()); // add line to vector of string inputs_from_file.push_back(new_input_line); } // close istream input_file_ifstream.close(); // size of user sets (string length) const int N = strsize; std::list<int> usersets; // all user sets converted to bitsets for (std::string s : inputs_from_file) usersets.push_back(to_set(s)); std::vector<int> counts; // one counter per subset int b = 0; // number of K subsets of an N set (the N over K binominal) for (int s=start(K); s<limit(N); s=next(s)) ++b; std::cout << "*** binominal = " << b << std::endl; counts.resize(b); bool quit = false; do { for (int& c : counts) c = 0; // reset counters // count all user sets for every subset for (int s=start(K), j=0; s<limit(N); s=next(s), ++j) { for (int set : usersets) { if ((set & s) == s) ++counts[j]; } } // find max counter (biggest run) int runSet = 0; for (int s=start(K), j=0, m=0; s<limit(N); s=next(s), ++j) { if (counts[j] > m) { m = counts[j]; runSet = s; } } // print all user sets belonging to this run if (runSet > 0) { // gather output sets std::vector<int> out; for (auto sp = usersets.begin(); sp != usersets.end();) { if ((*sp & runSet) == runSet) { out.push_back(*sp); sp = usersets.erase(sp); // remove printed user set } else ++sp; } //actual output std::sort(out.rbegin(),out.rend()); // from largest to smallest std::cout << to_string(runSet) << " *" << std::endl; for (int x : out) { std::cout << to_string(x) << " : " << x << std::endl; } } else quit = true; // no more runs std::cout << std::endl; } while (!quit); // print all remaining user sets if (usersets.size() > 0) { std::cout << "*** remaining sets:" << std::endl; for (int x : usersets) std::cout << to_string(x) << std::endl; } // main EOF }

This was compiled an built with g++ 4.8 using,
g++ -O2 -c -std=c++11 sort3_02.cpp
g++ -std=c++11 -o sort3_02.exe sort3_02.o

The code is quite fast (instantaneous) for all of the examples I have tried, even up to subset size of 9/18.

I am going to get this learning on the data set I have now and then work on making the subset size look up recursive, meaning the for 8 on bits, the sorting will start with subset size of 7. All strings that are not part of a run of at least 2 will be re-analyzed with a subset size of 6, then 5 etc. This will repeat until all strings have been assigned to a run, or there are only two unassigned strings remaining.

LMHmedchem