Commit 296f5ac0 by Kim, Jungwon

minor

parent 11d3e256
#ifndef PAPYRUS_KV_DEBUG_H
#define PAPYRUS_KV_DEBUG_H
#include <stdio.h>
//#define _TRACE_ENABLE
#define _CHECK_ENABLE
#define _DEBUG_ENABLE
#define _INFO_ENABLE
#define _ERROR_ENABLE
#define _TODO_ENABLE
#define _COLOR_DEBUG
#ifdef _COLOR_DEBUG
#define RED "\033[22;31m"
#define GREEN "\033[22;32m"
#define YELLOW "\033[22;33m"
#define BLUE "\033[22;34m"
#define PURPLE "\033[22;35m"
#define CYAN "\033[22;36m"
#define GRAY "\033[22;37m"
#define _RED "\033[22;41m"
#define _GREEN "\033[22;42m"
#define _YELLOW "\033[22;43m"
#define _BLUE "\033[22;44m"
#define _PURPLE "\033[22;45m"
#define _CYAN "\033[22;46m"
#define _GRAY "\033[22;47m"
#define RESET "\e[m"
#else
#define RED
#define GREEN
#define YELLOW
#define BLUE
#define PURPLE
#define CYAN
#define GRAY
#define _RED
#define _GREEN
#define _YELLOW
#define _BLUE
#define _PURPLE
#define _CYAN
#define _GRAY
#define RESET
#endif
extern char nick_[];
#ifdef _TRACE_ENABLE
#define _trace(fmt, ...) { printf( BLUE "[T] %s [%s:%d:%s] " fmt RESET "\n", nick_, __FILE__, __LINE__, __func__, __VA_ARGS__); fflush(stdout); }
#define __trace(fmt, ...) { printf(_BLUE "[T] %s [%s:%d:%s] " fmt RESET "\n", nick_, __FILE__, __LINE__, __func__, __VA_ARGS__); fflush(stdout); }
#else
#define _trace(fmt, ...)
#define __trace(fmt, ...)
#endif
#ifdef _CHECK_ENABLE
#define _check() { printf( PURPLE "[C] %s [%s:%d:%s]" RESET "\n", nick_, __FILE__, __LINE__, __func__); fflush(stdout); }
#define __check() { printf(_PURPLE "[C] %s [%s:%d:%s]" RESET "\n", nick_, __FILE__, __LINE__, __func__); fflush(stdout); }
#else
#define _check()
#define __check()
#endif
#ifdef _DEBUG_ENABLE
#define _debug(fmt, ...) { printf( CYAN "[D] %s [%s:%d:%s] " fmt RESET "\n", nick_, __FILE__, __LINE__, __func__, __VA_ARGS__); fflush(stdout); }
#define __debug(fmt, ...) { printf(_CYAN "[D] %s [%s:%d:%s] " fmt RESET "\n", nick_, __FILE__, __LINE__, __func__, __VA_ARGS__); fflush(stdout); }
#else
#define _debug(fmt, ...)
#define __debug(fmt, ...)
#endif
#ifdef _INFO_ENABLE
#define _info(fmt, ...) { printf( YELLOW "[I] %s [%s:%d:%s] " fmt RESET "\n", nick_, __FILE__, __LINE__, __func__, __VA_ARGS__); fflush(stdout); }
#define __info(fmt, ...) { printf(_YELLOW "[I] %s [%s:%d:%s] " fmt RESET "\n", nick_, __FILE__, __LINE__, __func__, __VA_ARGS__); fflush(stdout); }
#else
#define _info(fmt, ...)
#define __info(fmt, ...)
#endif
#ifdef _ERROR_ENABLE
#define _error(fmt, ...) { printf( RED "[E] %s [%s:%d:%s] " fmt RESET "\n", nick_, __FILE__, __LINE__, __func__, __VA_ARGS__); fflush(stdout); }
#define __error(fmt, ...) { printf(_RED "[E] %s [%s:%d:%s] " fmt RESET "\n", nick_, __FILE__, __LINE__, __func__, __VA_ARGS__); fflush(stdout); }
#else
#define _error(fmt, ...)
#define __error(fmt, ...)
#endif
#ifdef _TODO_ENABLE
#define _todo(fmt, ...) { printf( GREEN "[TODO] %s [%s:%d:%s] " fmt RESET "\n", nick_, __FILE__, __LINE__, __func__, __VA_ARGS__); fflush(stdout); }
#define __todo(fmt, ...) { printf(_GREEN "[TODO] %s [%s:%d:%s] " fmt RESET "\n", nick_, __FILE__, __LINE__, __func__, __VA_ARGS__); fflush(stdout); }
#else
#define _todo(fmt, ...)
#define __todo(fmt, ...)
#endif
#endif /* PAPYRUS_KV_DEBUG_H */
/****************************************************************/
/* Parallel Combinatorial BLAS Library (for Graph Computations) */
/* version 1.2 -------------------------------------------------*/
/* date: 10/06/2011 --------------------------------------------*/
/* authors: Aydin Buluc (abuluc@lbl.gov), Adam Lugowski --------*/
/****************************************************************/
/*
Copyright (c) 2011, Aydin Buluc
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef _DELETER_H_
#define _DELETER_H_
#include <iostream>
using namespace std;
struct DeletePtrIf
{
template<typename T, typename _BinaryPredicate, typename Pred>
void operator()(const T *ptr, _BinaryPredicate cond, Pred first, Pred second) const
{
if(cond(first,second))
delete ptr;
}
};
template<typename A>
void DeleteAll(A arr1)
{
delete [] arr1;
}
template<typename A, typename B>
void DeleteAll(A arr1, B arr2)
{
delete [] arr2;
DeleteAll(arr1);
}
template<typename A, typename B, typename C>
void DeleteAll(A arr1, B arr2, C arr3)
{
delete [] arr3;
DeleteAll(arr1, arr2);
}
template<typename A, typename B, typename C, typename D>
void DeleteAll(A arr1, B arr2, C arr3, D arr4)
{
delete [] arr4;
DeleteAll(arr1, arr2, arr3);
}
template<typename A, typename B, typename C, typename D, typename E>
void DeleteAll(A arr1, B arr2, C arr3, D arr4, E arr5)
{
delete [] arr5;
DeleteAll(arr1, arr2, arr3, arr4);
}
template<typename A, typename B, typename C, typename D, typename E, typename F>
void DeleteAll(A arr1, B arr2, C arr3, D arr4, E arr5, F arr6)
{
delete [] arr6;
DeleteAll(arr1, arr2, arr3, arr4,arr5);
}
#endif
#ifndef _FRIENDS_H_
#define _FRIENDS_H_
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <fstream>
#include <sstream>
#include "Kmer.hpp"
#include "KmerIterator.hpp"
#include "Deleter.h"
#include <sys/stat.h>
using namespace std;
#ifndef MAX_KMER_SIZE
#define MAX_KMER_SIZE 64
#endif
#define KMERLONGS MAX_KMER_SIZE/32 // 32 = numbits(uint64_t)/2- with 2 being the number of bits needed per nucleotide
typedef array<uint64_t, KMERLONGS> MERARR;
struct filedata
{
char filename[256];
size_t filesize;
};
ostream & operator<<(ostream & os, uint8_t val)
{
return os << static_cast<int>(val);
}
struct kmerpack // the pair<MERARR,int> used as value_type in map is not guaranteed to be contiguous in memory
{
MERARR arr;
int count;
bool operator > (const kmerpack & rhs) const
{ return (arr > rhs.arr); }
bool operator < (const kmerpack & rhs) const
{ return (arr < rhs.arr); }
bool operator == (const kmerpack & rhs) const
{ return (arr == rhs.arr); }
};
struct ufxpack // 38bytes for k=51
{
MERARR arr; // ~128-bits=16bytes for k=51
int count;
char left;
char right;
int leftmin;
int leftmax;
int rightmin;
int rightmax;
bool operator > (const ufxpack & rhs) const
{ return (arr > rhs.arr); }
bool operator < (const ufxpack & rhs) const
{ return (arr < rhs.arr); }
bool operator == (const ufxpack & rhs) const
{ return (arr == rhs.arr); }
};
void PackIntoUFX(array<int,4> & leftcnt, array<int,4> & righcnt, int count, ufxpack & pack)
{
pair<int, char> lsort[4] = {make_pair(leftcnt[0], 'A'), make_pair(leftcnt[1], 'C'), make_pair(leftcnt[2], 'G'), make_pair(leftcnt[3], 'T')};
pair<int, char> rsort[4] = {make_pair(righcnt[0], 'A'), make_pair(righcnt[1], 'C'), make_pair(righcnt[2], 'G'), make_pair(righcnt[3], 'T')};
sort(lsort, lsort+4);
sort(rsort, rsort+4);
pack.left = lsort[3].second; // max entry guarenteed to exist
pack.leftmax = lsort[3].first;
pack.leftmin = lsort[2].first;
pack.right = rsort[3].second;
pack.rightmax = rsort[3].first;
pack.rightmin = rsort[2].first;
pack.count = count;
}
struct SNPdata
{
MERARR karr;
char extA;
char extB;
bool operator > (const SNPdata & rhs) const
{ return (karr > rhs.karr); }
bool operator < (const SNPdata & rhs) const
{ return (karr < rhs.karr); }
bool operator == (const SNPdata & rhs) const
{ return (karr == rhs.karr); }
};
#endif
#ifndef BFG_KMER_HPP
#define BFG_KMER_HPP
#ifndef MAX_KMER_SIZE
#define MAX_KMER_SIZE 64 // ABAB: This code will probably crush if this is not a multiple of 32
#endif
#include <stdio.h>
#include <stdint.h>
#include <cassert>
#include <cstring>
#include <string>
#include <array>
#include "hash.hpp"
/* Short description:
* - Store kmer strings by using 2 bits per base instead of 8
* - Easily return reverse complements of kmers, e.g. TTGG -> CCAA
* - Easily compare kmers
* - Provide hash of kmers
* - Get last and next kmer, e.g. ACGT -> CGTT or ACGT -> AACGT
* */
class Kmer {
public:
Kmer();
Kmer(const Kmer& o);
explicit Kmer(const char *s);
void copyDataFrom(uint8_t * mybytes) // this is like a shadow constructor (to avoid accidental signature match with the existing constructor)
{
memcpy(longs, mybytes, sizeof(uint64_t) * (MAX_K/32));
}
explicit Kmer(const std::array<uint64_t, MAX_KMER_SIZE/32> & arr)
{
std::memcpy (longs, arr.data(), sizeof(uint64_t) * (MAX_K/32));
}
Kmer& operator=(const Kmer& o);
void set_deleted();
bool operator<(const Kmer& o) const;
bool operator==(const Kmer& o) const;
bool operator!=(const Kmer& o) const {
return !(*this == o);
}
void set_kmer(const char *s);
uint64_t hash() const;
Kmer twin() const;
Kmer rep() const; // ABAB: return the smaller of itself (lexicographically) or its reversed-complement (i.e. twin)
Kmer getLink(const size_t index) const;
Kmer forwardBase(const char b) const;
Kmer backwardBase(const char b) const;
std::string getBinary() const;
void toString(char * s) const;
std::string toString() const;
void copyDataInto(void * pointer) const
{
// void * memcpy ( void * destination, const void * source, size_t num );
memcpy(pointer, longs, sizeof(uint64_t) * (MAX_K/32));
}
// ABAB: return the raw data packed in an std::array
// this preserves the lexicographical order on k-mers
// i.e. A.toString() < B.toString <=> A.getArray() < B.getArray()
std::array<uint64_t, MAX_KMER_SIZE/32> getArray()
{
std::array<uint64_t,MAX_K/32> i64array;
std::memcpy (i64array.data(),longs,sizeof(uint64_t) * (MAX_K/32));
return i64array;
}
bool equalUpToLastBase(const Kmer & rhs); // returns true for completely identical k-mers as well as k-mers that only differ at the last base
// static functions
static void set_k(unsigned int _k);
static constexpr size_t numBytes() {
return (sizeof(uint64_t) * (MAX_K/32));
}
static const unsigned int MAX_K = MAX_KMER_SIZE;
static unsigned int k;
private:
static unsigned int k_bytes;
static unsigned int k_longs;
static unsigned int k_modmask; // int?
// data fields
union {
uint8_t bytes[MAX_K/4];
uint64_t longs[MAX_K/32];
};
// Unions are very useful for low-level programming tasks that involve writing to the same memory area
// but at different portions of the allocated memory space, for instance:
// union item {
// // The item is 16-bits
// short theItem;
// // In little-endian lo accesses the low 8-bits -
// // hi, the upper 8-bits
// struct { char lo; char hi; } portions;
// };
// item tItem;
// tItem.theItem = 0xBEAD;
// tItem.portions.lo = 0xEF; // The item now equals 0xBEEF
// void shiftForward(int shift);
// void shiftBackward(int shift);
};
struct KmerHash {
size_t operator()(const Kmer &km) const {
return km.hash();
}
};
/*
namespace std { // to use in unordered_map
template <>
struct hash<Kmer>
{
uint64_t operator()(const Kmer& k) const
{
return k.hash();
}
};
}*/
#endif // BFG_KMER_HPP
#include <iterator>
#include <utility>
#include "Kmer.hpp"
#include "KmerIterator.hpp"
/* Note: That an iter is exhausted means that (iter._invalid == true) */
// use: ++iter;
// pre:
// post: *iter is now exhausted
// OR *iter is the next valid pair of kmer and location
KmerIterator& KmerIterator::operator++() {
int pos_ = p_.second;
if (!invalid_) {
if (s_[pos_+Kmer::k] == 0) {
invalid_ = true;
return *this;
} else {
find_next(pos_,pos_+Kmer::k-1,true);
return *this;
}
}
return *this;
}
// use: iter++;
// pre:
// post: iter has been incremented by one
KmerIterator KmerIterator::operator++(int) {
KmerIterator tmp(*this);
operator++();
return tmp;
}
// use: val = (a == b);
// pre:
// post: (val == true) if a and b are both exhausted
// OR a and b are in the same location of the same string.
// (val == false) otherwise.
bool KmerIterator::operator==(const KmerIterator& o) {
if (invalid_ || o.invalid_) {
return invalid_ && o.invalid_;
} else {
return (s_ == o.s_) && (p_.second == o.p_.second);
}
}
// use: p = *iter;
// pre:
// post: p is NULL or a pair of Kmer and int
std::pair<Kmer, int>& KmerIterator::operator*() {
return p_;
}
// use: example 1: km = iter->first;
// example 2: i = iter->second;
// pre: *iter is not NULL
// post: km will be (*iter).first, i will be (*iter).second
std::pair<Kmer, int>* KmerIterator::operator->() {
return &(operator*());
}
// use: iter.raise(km, rep);
// post: iter has been incremented by one
// if iter is not invalid, km is iter->first and rep is km.rep()
void KmerIterator::raise(Kmer &km, Kmer &rep) {
operator++();
if (!invalid_) {
km = p_.first;
rep = km.rep();
}
}
// use: find_next(i,j, last_valid);
// pre:
// post: *iter is either invalid or is a pair of:
// 1) the next valid kmer in the string that does not have any 'N'
// 2) the location of that kmer in the string
void KmerIterator::find_next(size_t i, size_t j, bool last_valid) {
++i;
++j;
while (s_[j] != 0) {
char c = s_[j];
if (c == 'A' || c == 'C' || c == 'G' || c == 'T') {
if (last_valid) {
p_.first = p_.first.forwardBase(c);
break; // default case,
} else {
if (i + Kmer::k - 1 == j) {
p_.first = Kmer(s_+i);
last_valid = true;
break; // create k-mer from scratch
} else {
++j;
}
}
} else {
++j;
i = j;
last_valid = false;
}
}
if (i+Kmer::k-1 == j && s_[j] != 0) {
p_.second = i;
} else {
invalid_ = true;
}
}
#ifndef BFG_KMER_ITERATOR_HPP
#define BFG_KMER_ITERATOR_HPP
#include <iterator>
#include "Kmer.hpp"
/* Short description:
* - Easily iterate through kmers in a read
* - If the read contains any N, then the N is skipped and checked whether
* there is a kmer to the right of the N
* - iter->first gives the kmer, iter->second gives the position within the reads
* */
class KmerIterator : public std::iterator<std::input_iterator_tag, std::pair<Kmer, int>, int> {
public:
KmerIterator() : s_(NULL), p_(), invalid_(true) {}
KmerIterator(const char* s) : s_(s), p_(), invalid_(false) { find_next(-1,-1,false);}
KmerIterator(const KmerIterator& o) : s_(o.s_), p_(o.p_), invalid_(o.invalid_) {}
KmerIterator& operator++();
KmerIterator operator++(int);
void raise(Kmer &km, Kmer &rep);
bool operator==(const KmerIterator& o);
bool operator!=(const KmerIterator& o) { return !this->operator==(o);}
std::pair<Kmer, int>& operator*();
std::pair<Kmer, int>* operator->();
private:
void find_next(size_t i, size_t j, bool last_valid);
const char *s_;
std::pair<Kmer, int> p_;
bool invalid_;
};
#endif // BFG_KMER_ITERATOR_HPP
#ifndef BFG_KMER_MIDDLE_HPP
#define BFG_KMER_MIDDLE_HPP
#ifndef MAX_KMER_SIZE
#define MAX_KMER_SIZE 64 // ABAB: This code will probably crush if this is not a multiple of 32
#endif
#include <stdio.h>
#include <stdint.h>
#include <cassert>
#include <cstring>
#include <string>
#include <array>
#include "hash.hpp"
/* Short description:
* ABAB: This class is a workaround to the static members of the k-mer class
* Works like an alternative k-mer class as needed
* - Store kmer strings by using 2 bits per base instead of 8
* - Easily return reverse complements of kmers, e.g. TTGG -> CCAA
* - Easily compare kmers
* - Provide hash of kmers
* - Get last and next kmer, e.g. ACGT -> CGTT or ACGT -> AACGT
* */
class KmerMiddle {
public:
KmerMiddle();
KmerMiddle(const KmerMiddle& o);
explicit KmerMiddle(const char *s);
void copyDataFrom(uint8_t * mybytes) // this is like a shadow constructor (to avoid accidental signature match with the existing constructor)
{
memcpy(longs, mybytes, sizeof(uint64_t) * (MAX_K/32));
}
explicit KmerMiddle(const std::array<uint64_t, MAX_KMER_SIZE/32> & arr)
{
std::memcpy (longs, arr.data(), sizeof(uint64_t) * (MAX_K/32));
}
KmerMiddle& operator=(const KmerMiddle& o);
void set_deleted();
bool operator<(const KmerMiddle& o) const;
bool operator==(const KmerMiddle& o) const;
bool operator!=(const KmerMiddle& o) const {
return !(*this == o);
}
void set_kmer(const char *s);
uint64_t hash() const;
KmerMiddle twin() const;
KmerMiddle rep() const; // ABAB: return the smaller of itself (lexicographically) or its reversed-complement (i.e. twin)
KmerMiddle getLink(const size_t index) const;
KmerMiddle forwardBase(const char b) const;
KmerMiddle backwardBase(const char b) const;
std::string getBinary() const;
void toString(char * s) const;
std::string toString() const;
void copyDataInto(void * pointer) const
{
// void * memcpy ( void * destination, const void * source, size_t num );
memcpy(pointer, longs, sizeof(uint64_t) * (MAX_K/32));
}
// ABAB: return the raw data packed in an std::array
// this preserves the lexicographical order on k-mers
// i.e. A.toString() < B.toString <=> A.getArray() < B.getArray()
std::array<uint64_t, MAX_KMER_SIZE/32> getArray()
{
std::array<uint64_t,MAX_K/32> i64array;
std::memcpy (i64array.data(),longs,sizeof(uint64_t) * (MAX_K/32));
return i64array;
}
bool equalUpToLastBase(const KmerMiddle & rhs); // returns true for completely identical k-mers as well as k-mers that only differ at the last base
// static functions
static void set_k(unsigned int _k);
static constexpr size_t numBytes() {
return (sizeof(uint64_t) * (MAX_K/32));
}
static const unsigned int MAX_K = MAX_KMER_SIZE;
static unsigned int k;
private:
static unsigned int k_bytes;
static unsigned int k_longs;
static unsigned int k_modmask; // int?
// data fields
union {
uint8_t bytes[MAX_K/4];
uint64_t longs[MAX_K/32];
};
// Unions are very useful for low-level programming tasks that involve writing to the same memory area
// but at different portions of the allocated memory space, for instance:
// union item {
// // The item is 16-bits
// short theItem;
// // In little-endian lo accesses the low 8-bits -
// // hi, the upper 8-bits
// struct { char lo; char hi; } portions;
// };
// item tItem;
// tItem.theItem = 0xBEAD;
// tItem.portions.lo = 0xEF; // The item now equals 0xBEEF
// void shiftForward(int shift);
// void shiftBackward(int shift);
};
struct KmerMiddleHash {
size_t operator()(const KmerMiddle &km) const {
return km.hash();
}
};
/*
namespace std { // to use in unordered_map