Re: question about sorting on bitstrings
I have made a few modifications to make the tool more widget like. It now reads the subset size from the command line and takes the data from an input file.
Code:
// file sort3_02.cpp based from code by razzle
// http://forums.codeguru.com/member.php?419897-razzle
#include <cstdlib>
#include <list>
#include <vector>
#include <string>
#include <bitset>
#include <iostream>
#include <algorithm>
#include <unistd.h>
#include <fstream>
const int strsize = 18;
inline int next(int x) { // next K subset of an N bitset (gosper's hack (hakmem 175))
int s = x & -x;
int r = x + s;
return r | (((x^r)>>2)/s);
}
inline int start(int k) { // first subset
return (1<<k) - 1;
}
inline int limit(int n) { // first non-subset
return 1<<n;
}
int to_set(const std::string& s) { // string to bitset
return int(std::bitset<strsize>(s).to_ulong());
}
std::string to_string(int x) { // bitset to string
return std::bitset<strsize>(x).to_string();
}
int main(int argc, char **argv) {
// declarations for args-----------------------------------------------
// command line arguments
int cmd_arguments;
// subset size
int K = 0;
std::string str_K;
// input file read to data[]
std::string input_file;
// Parse command line arguments
while ((cmd_arguments = getopt (argc, argv, "s:i:h:")) != -1)
switch (cmd_arguments)
{
case 's':
str_K = optarg;
K = atoi(str_K.c_str());
break;
case 'i':
input_file = optarg;
break;
case 'h':
// print help blurb
std::cout << std::endl;
std::cout << "use argument -i to specify input file" << std::endl;
std::cout << "use argument -s to specify subset size" << std::endl;
std::cout << std::endl;
exit(0);
break;
}
// check args
if(input_file.length() == 0) {
std::cerr << " -> no input file was specified, use -i" << std::endl;
exit(-1);
}
if(str_K.length() == 0) {
std::cerr << " -> subset size was specified, use -s" << std::endl;
exit(-2);
}
//---------------------------------------------------------------------
// create an input stream and open the input_file file
std::ifstream input_file_ifstream;
input_file_ifstream.open( input_file.c_str() );
// check if input_file was opened
if(!input_file_ifstream.is_open()) {
std::cerr << "file " << input_file << " could not be opened" << std::endl;
exit(-3);
}
// save data from file in vector
std::vector<std::string> inputs_from_file;
std::string new_input_line;
while( getline(input_file_ifstream, new_input_line) ) {
// remove '\r' EOL chars and tab characters
new_input_line.erase (remove(new_input_line.begin(),new_input_line.end(),'\r'), new_input_line.end());
new_input_line.erase (remove(new_input_line.begin(),new_input_line.end(),'\t'), new_input_line.end());
// add line to vector of string
inputs_from_file.push_back(new_input_line);
}
// close istream
input_file_ifstream.close();
// size of user sets (string length)
const int N = strsize;
std::list<int> usersets; // all user sets converted to bitsets
for (std::string s : inputs_from_file) usersets.push_back(to_set(s));
std::vector<int> counts; // one counter per subset
int b = 0; // number of K subsets of an N set (the N over K binominal)
for (int s=start(K); s<limit(N); s=next(s)) ++b;
std::cout << "*** binominal = " << b << std::endl;
counts.resize(b);
bool quit = false;
do {
for (int& c : counts) c = 0; // reset counters
// count all user sets for every subset
for (int s=start(K), j=0; s<limit(N); s=next(s), ++j) {
for (int set : usersets) {
if ((set & s) == s) ++counts[j];
}
}
// find max counter (biggest run)
int runSet = 0;
for (int s=start(K), j=0, m=0; s<limit(N); s=next(s), ++j) {
if (counts[j] > m) {
m = counts[j];
runSet = s;
}
}
// print all user sets belonging to this run
if (runSet > 0) {
// gather output sets
std::vector<int> out;
for (auto sp = usersets.begin(); sp != usersets.end();) {
if ((*sp & runSet) == runSet) {
out.push_back(*sp);
sp = usersets.erase(sp); // remove printed user set
} else ++sp;
}
//actual output
std::sort(out.rbegin(),out.rend()); // from largest to smallest
std::cout << to_string(runSet) << " *" << std::endl;
for (int x : out) {
std::cout << to_string(x) << " : " << x << std::endl;
}
} else quit = true; // no more runs
std::cout << std::endl;
} while (!quit);
// print all remaining user sets
if (usersets.size() > 0) {
std::cout << "*** remaining sets:" << std::endl;
for (int x : usersets) std::cout << to_string(x) << std::endl;
}
// main EOF
}
This was compiled an built with g++ 4.8 using,
g++ -O2 -c -std=c++11 sort3_02.cpp
g++ -std=c++11 -o sort3_02.exe sort3_02.o
The code is quite fast (instantaneous) for all of the examples I have tried, even up to subset size of 9/18.
I am going to get this learning on the data set I have now and then work on making the subset size look up recursive, meaning the for 8 on bits, the sorting will start with subset size of 7. All strings that are not part of a run of at least 2 will be re-analyzed with a subset size of 6, then 5 etc. This will repeat until all strings have been assigned to a run, or there are only two unassigned strings remaining.
LMHmedchem
Re: question about sorting on bitstrings
Quote:
Originally Posted by
LMHmedchem
The code is quite fast (instantaneous) for all of the examples I have tried, even up to subset size of 9/18.
Just for the sake of it I've improved on the algorithm somewhat. It's based on the observation that probably not all subsets are actually used. So I remove the subsets that are not present in any 18-bit string and store the rest in a vector called subsets. All subsets are scanned just once at the beginning (using the full gosper loop). After that the (hopefully fewer) subsets in the subsets vector are used.
And indeed, with the original test strings the reduction is from 816 to 123 subsets. That should translate to almost 10 times faster code. The optimization will depend on the nature of the 18-bit strings so nothing is guaranteed but at least it will not slow down the program.
Also the memory requirements have been reduced. The biggest memory consumer was the counts vector of size 816. After the optimization there are two vectors namely counts and subsets of the same size 123. It's a decrease from 812 before to 246 after the optimization. But if unfortunately no subset reduction took place the memory requirements will instead double, increasing from 816 to 1632. That's the worst case cost of the optimization.
The optimization is limited to one section of my original code. If you want to use it just replace the old section with the new section below. Good luck with your research.
Changed code section:
Code:
// ---
std::vector<int> counts; // one counter per subset
std::vector<int> subsets; // sequence of actually used subsets
int b = 0; // number of K subsets of an N set (the N over K binominal)
for (int s=start(K); s<limit(N); s=next(s)) { // full subset (gosper) loop
for (int set : usersets) {
if ((set & s) == s) {
subsets.push_back(s); // subset is in use
break;
}
}
++b;
}
counts.resize(subsets.size());
std::cout << "*** all subsets (the binominal) = " << b << std::endl;
std::cout << "*** subsets in use = " << subsets.size() << std::endl;
bool quit = false;
do {
for (int& c : counts) c = 0; // reset counters
// count all user sets for every subset
for (int j=0, J=subsets.size(); j<J; ++j) { // reduced subset loop
int s = subsets[j];
int& c = counts[j];
for (int set : usersets) {
if ((set & s) == s) ++c; // increment subset counter
}
}
// find max counter (biggest run)
int runSet = 0;
for (int j=0, J=subsets.size(), m=0; j<J; ++j) {
if (counts[j] > m) {
m = counts[j];
runSet = subsets[j];
}
}
// ---