-
April 3rd, 2014, 04:20 PM
#16
Re: question about sorting on bitstrings
I have made a few modifications to make the tool more widget like. It now reads the subset size from the command line and takes the data from an input file.
Code:
// file sort3_02.cpp based from code by razzle
// http://forums.codeguru.com/member.php?419897-razzle
#include <cstdlib>
#include <list>
#include <vector>
#include <string>
#include <bitset>
#include <iostream>
#include <algorithm>
#include <unistd.h>
#include <fstream>
const int strsize = 18;
inline int next(int x) { // next K subset of an N bitset (gosper's hack (hakmem 175))
int s = x & -x;
int r = x + s;
return r | (((x^r)>>2)/s);
}
inline int start(int k) { // first subset
return (1<<k) - 1;
}
inline int limit(int n) { // first non-subset
return 1<<n;
}
int to_set(const std::string& s) { // string to bitset
return int(std::bitset<strsize>(s).to_ulong());
}
std::string to_string(int x) { // bitset to string
return std::bitset<strsize>(x).to_string();
}
int main(int argc, char **argv) {
// declarations for args-----------------------------------------------
// command line arguments
int cmd_arguments;
// subset size
int K = 0;
std::string str_K;
// input file read to data[]
std::string input_file;
// Parse command line arguments
while ((cmd_arguments = getopt (argc, argv, "s:i:h:")) != -1)
switch (cmd_arguments)
{
case 's':
str_K = optarg;
K = atoi(str_K.c_str());
break;
case 'i':
input_file = optarg;
break;
case 'h':
// print help blurb
std::cout << std::endl;
std::cout << "use argument -i to specify input file" << std::endl;
std::cout << "use argument -s to specify subset size" << std::endl;
std::cout << std::endl;
exit(0);
break;
}
// check args
if(input_file.length() == 0) {
std::cerr << " -> no input file was specified, use -i" << std::endl;
exit(-1);
}
if(str_K.length() == 0) {
std::cerr << " -> subset size was specified, use -s" << std::endl;
exit(-2);
}
//---------------------------------------------------------------------
// create an input stream and open the input_file file
std::ifstream input_file_ifstream;
input_file_ifstream.open( input_file.c_str() );
// check if input_file was opened
if(!input_file_ifstream.is_open()) {
std::cerr << "file " << input_file << " could not be opened" << std::endl;
exit(-3);
}
// save data from file in vector
std::vector<std::string> inputs_from_file;
std::string new_input_line;
while( getline(input_file_ifstream, new_input_line) ) {
// remove '\r' EOL chars and tab characters
new_input_line.erase (remove(new_input_line.begin(),new_input_line.end(),'\r'), new_input_line.end());
new_input_line.erase (remove(new_input_line.begin(),new_input_line.end(),'\t'), new_input_line.end());
// add line to vector of string
inputs_from_file.push_back(new_input_line);
}
// close istream
input_file_ifstream.close();
// size of user sets (string length)
const int N = strsize;
std::list<int> usersets; // all user sets converted to bitsets
for (std::string s : inputs_from_file) usersets.push_back(to_set(s));
std::vector<int> counts; // one counter per subset
int b = 0; // number of K subsets of an N set (the N over K binominal)
for (int s=start(K); s<limit(N); s=next(s)) ++b;
std::cout << "*** binominal = " << b << std::endl;
counts.resize(b);
bool quit = false;
do {
for (int& c : counts) c = 0; // reset counters
// count all user sets for every subset
for (int s=start(K), j=0; s<limit(N); s=next(s), ++j) {
for (int set : usersets) {
if ((set & s) == s) ++counts[j];
}
}
// find max counter (biggest run)
int runSet = 0;
for (int s=start(K), j=0, m=0; s<limit(N); s=next(s), ++j) {
if (counts[j] > m) {
m = counts[j];
runSet = s;
}
}
// print all user sets belonging to this run
if (runSet > 0) {
// gather output sets
std::vector<int> out;
for (auto sp = usersets.begin(); sp != usersets.end();) {
if ((*sp & runSet) == runSet) {
out.push_back(*sp);
sp = usersets.erase(sp); // remove printed user set
} else ++sp;
}
//actual output
std::sort(out.rbegin(),out.rend()); // from largest to smallest
std::cout << to_string(runSet) << " *" << std::endl;
for (int x : out) {
std::cout << to_string(x) << " : " << x << std::endl;
}
} else quit = true; // no more runs
std::cout << std::endl;
} while (!quit);
// print all remaining user sets
if (usersets.size() > 0) {
std::cout << "*** remaining sets:" << std::endl;
for (int x : usersets) std::cout << to_string(x) << std::endl;
}
// main EOF
}
This was compiled an built with g++ 4.8 using,
g++ -O2 -c -std=c++11 sort3_02.cpp
g++ -std=c++11 -o sort3_02.exe sort3_02.o
The code is quite fast (instantaneous) for all of the examples I have tried, even up to subset size of 9/18.
I am going to get this learning on the data set I have now and then work on making the subset size look up recursive, meaning the for 8 on bits, the sorting will start with subset size of 7. All strings that are not part of a run of at least 2 will be re-analyzed with a subset size of 6, then 5 etc. This will repeat until all strings have been assigned to a run, or there are only two unassigned strings remaining.
LMHmedchem
Posting Permissions
- You may not post new threads
- You may not post replies
- You may not post attachments
- You may not edit your posts
-
Forum Rules
|
Click Here to Expand Forum to Full Width
|