sampleSNPs  0.9
Fast ordered sampling of records from files
varfiles.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017 Anthony J. Greenberg
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5  *
6  * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7  *
8  * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9  *
10  * 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11  *
12  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
13  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
14  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
15  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
16  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
17  * THE POSSIBILITY OF SUCH DAMAGE.
18  */
19 
21 
37 #ifndef varfiles_hpp
38 #define varfiles_hpp
39 
40 #include <fstream>
41 #include <string>
42 #include <vector>
43 #include <unordered_map>
44 #include <cstdint>
45 #include <limits>
46 
47 #include "populations.hpp"
48 
49 using std::fstream;
50 using std::string;
51 using std::vector;
52 using std::unordered_map;
53 using std::numeric_limits;
54 
55 namespace sampFiles {
// Forward declarations of every file class in this header, so the input
// classes can name their matching output classes before they are defined.
class VarFile;
class GbinFile;
class GbinFileI;
class GbinFileO;
class BedFile;
class BedFileI;
class BedFileO;
class GtxtFile;
class GtxtFileI;
class GtxtFileO;
class TpedFile;
class TpedFileI;
class TpedFileO;
class VcfFile;
class VcfFileI;
class VcfFileO;
class HmpFile;
class HmpFileI;
class HmpFileO;

/** \brief Buffer size
 *
 * Equal to 10 * 1024 * 1024. NOTE(review): presumably a size in bytes -- confirm against the implementation.
 * `constexpr` at namespace scope already has internal linkage, so no `static` is needed.
 */
constexpr size_t BUF_SIZE = 10485760;
/** \brief Machine epsilon for `double`
 *
 * `constexpr` so that it can be used in constant expressions.
 */
constexpr double EPS = numeric_limits<double>::epsilon();
/** \brief pi
 *
 * `constexpr` so that it can be used in constant expressions.
 */
constexpr double PI = 3.14159265358979323846264338328;
87 
/** \brief Base variant file class
 *
 * Abstract base that owns the variant file stream. It cannot be constructed directly:
 * the default constructor is protected and `open()` / `close()` are pure virtual.
 */
class VarFile {
protected:
    /// Variant file stream
    fstream _varFile;

    /// Default constructor (protected). Makes the stream throw when `badbit` is set.
    VarFile() { _varFile.exceptions(fstream::badbit); }

public:
    /// Copy constructor (defaulted; `std::fstream` is not copyable, so this ends up deleted)
    VarFile(const VarFile &in) = default;
    /// Copy assignment (defaulted; deleted for the same reason as the copy constructor)
    VarFile &operator=(const VarFile &in) = default;
    /// Move constructor
    VarFile(VarFile &&in) = default;
    /// Move assignment
    VarFile &operator=(VarFile &&in) = default;
    /** \brief Destructor
     *
     * Closes the stream if it is still open. Declared `virtual` because this is a
     * polymorphic base: without it, deleting a derived object through a `VarFile`
     * pointer would be undefined behaviour.
     */
    virtual ~VarFile() { if (_varFile.is_open()) _varFile.close(); }

    /// Open stream
    virtual void open() = 0;
    /// Close stream
    virtual void close() = 0;
};
117 
123  class GbinFile : public VarFile {
124  protected:
126  string _fileName;
128  size_t _nCols;
130  size_t _elemSize;
131 
132  public:
134  GbinFile() : VarFile(), _nCols{0}, _elemSize{sizeof(char)} {};
144  GbinFile(const string &fileName, const size_t &nCols, const size_t &elemSize) : VarFile(), _fileName{fileName}, _nCols{nCols}, _elemSize{elemSize} {};
145 
147  GbinFile(const GbinFile &in) = default;
149  GbinFile &operator=(const GbinFile &in) = default;
151  GbinFile(GbinFile &&in) = default;
153  GbinFile &operator=(GbinFile &&in) = default;
156 
158  virtual void open() {};
160  virtual void close();
161 
162  };
163 
169  class GbinFileI : GbinFile {
170  protected:
177  virtual uint64_t _numLines();
178 
179  public:
181  GbinFileI() : GbinFile() {};
189  GbinFileI(const string &fileName, const size_t &nCols, const size_t &elemSize) : GbinFile(fileName, nCols, elemSize) {};
191  GbinFileI(const GbinFileI &in) = default;
193  GbinFileI &operator=(const GbinFileI &in) = default;
195  GbinFileI(GbinFileI &&in) = default;
197  GbinFileI &operator=(GbinFileI &&in) = default;
200 
202  void open();
203 
212  void sample(GbinFileO &out, const uint64_t &n);
214  uint64_t nlines() { return _numLines(); };
215 
216  };
217 
223  class GbinFileO : public GbinFile {
224  friend class GbinFileI;
225  protected:
226 
227  public:
229  GbinFileO() : GbinFile() {};
237  GbinFileO(const string &fileName, const size_t &nCols, const size_t &elemSize) : GbinFile(fileName, nCols, elemSize) {};
239  GbinFileO(const GbinFileO &in) = default;
241  GbinFileO &operator=(const GbinFileO &in) = default;
243  GbinFileO(GbinFileO &&in) = default;
245  GbinFileO &operator=(GbinFileO &&in) = default;
248 
250  void open();
251 
252  };
257  class BedFile : public GbinFile {
258  protected:
259 
261  fstream _famFile;
263  fstream _bimFile;
265  string _fileStub;
271  static const vector<char> _masks;
277  static const unordered_map<char, string> _tests;
278 
279 
280  public:
282  BedFile();
287  BedFile(const string &stubName);
289  BedFile(const BedFile &in) = default;
291  BedFile &operator=(const BedFile &in) = default;
293  BedFile(BedFile &&in) = default;
295  BedFile &operator=(BedFile &&in) = default;
297  ~BedFile();
298 
300  virtual void open() {};
302  void close();
303 
304  };
305 
311  class BedFileI : public BedFile {
312  protected:
313 
320  uint64_t _numLines();
327  uint64_t _famLines();
336  uint64_t _famLines(fstream &fam);
337 
353  void _ld(const char *snp1, const char *snp2, const size_t &N, const unsigned short &pad, double &rSq, double &Dprime, double &dcnt1, double &dcnt2);
369  void _ld(const char *snp1, const char *snp2, const PopIndex &popID, vector<double> &rSq, vector<double> &Dprime, vector<double> &dcnt1, vector<double> &dcnt2);
370  public:
372  BedFileI() : BedFile() {};
377  BedFileI(const string &stubName) : BedFile(stubName) {};
379  BedFileI(const BedFileI &in) = default;
381  BedFileI &operator=(const BedFileI &in) = default;
383  BedFileI(BedFileI &&in) = default;
385  BedFileI &operator=(BedFileI &&in) = default;
388 
390  void open();
391 
400  void sample(BedFileO &out, const uint64_t &n);
401 
409  void sampleLD(const uint64_t &n);
418  void sampleLD(const PopIndex &popID, const uint64_t &n);
420  uint64_t nsnp() { return _numLines(); };
422  uint64_t nindiv() { return _famLines(); };
423 
424  };
425 
431  class BedFileO : public BedFile {
432  friend class BedFileI;
433  protected:
434 
435  public:
437  BedFileO() : BedFile() {};
442  BedFileO(const string &stubName) : BedFile(stubName) {};
444  BedFileO(const BedFileO &in) = default;
446  BedFileO &operator=(const BedFileO &in) = default;
448  BedFileO(BedFileO &&in) = default;
450  BedFileO &operator=(BedFileO &&in) = default;
453 
455  void open();
456 
457  };
463  class GtxtFile : public VarFile {
464  protected:
466  string _fileName;
468  bool _head;
469 
470  public:
472  GtxtFile() : VarFile(), _head{false} {};
478  GtxtFile(const string &fileName) : VarFile(), _fileName{fileName}, _head{false} {};
485  GtxtFile(const string &fileName, const bool &head) : VarFile(), _fileName{fileName}, _head{head} {};
486 
488  GtxtFile(const GtxtFile &in) = default;
490  GtxtFile &operator=(const GtxtFile &in) = default;
492  GtxtFile(GtxtFile &&in) = default;
494  GtxtFile &operator=(GtxtFile &&in) = default;
497 
499  virtual void open() {};
501  virtual void close();
502 
503  };
504 
510  class GtxtFileI : GtxtFile {
511  protected:
518  virtual uint64_t _numLines();
519 
520  public:
522  GtxtFileI() : GtxtFile() {};
527  GtxtFileI(const string &fileName) : GtxtFile(fileName) {};
533  GtxtFileI(const string &fileName, const bool &head) : GtxtFile(fileName, head) {};
535  GtxtFileI(const GtxtFileI &in) = default;
537  GtxtFileI &operator=(const GtxtFileI &in) = default;
539  GtxtFileI(GtxtFileI &&in) = default;
541  GtxtFileI &operator=(GtxtFileI &&in) = default;
544 
546  void open();
547 
557  void sample(GtxtFileO &out, const uint64_t &n, const bool &headSkip);
568  void sample(const uint64_t &n, const bool &headSkip, const char &delim, vector<string> &out);
570  uint64_t nlines() { return _numLines(); };
571 
572  };
573 
579  class GtxtFileO : public GtxtFile {
580  friend class GtxtFileI;
581  protected:
582 
583  public:
585  GtxtFileO() : GtxtFile() {};
590  GtxtFileO(const string &fileName) : GtxtFile(fileName) {};
596  GtxtFileO(const string &fileName, const bool &head) : GtxtFile(fileName, head) {};
598  GtxtFileO(const GtxtFileO &in) = default;
600  GtxtFileO &operator=(const GtxtFileO &in) = default;
602  GtxtFileO(GtxtFileO &&in) = default;
604  GtxtFileO &operator=(GtxtFileO &&in) = default;
607 
609  void open();
610 
611  };
612 
613 
618  class TpedFile : public GtxtFile {
619  protected:
621  fstream _tfamFile;
623  string _fileStub;
624 
625  public:
627  TpedFile() : GtxtFile() {_tfamFile.exceptions(fstream::badbit); };
632  TpedFile(const string &stubName) : GtxtFile(stubName + ".tped"), _fileStub{stubName} {_tfamFile.exceptions(fstream::badbit); }; // no headers in .tped
634  TpedFile(const TpedFile &in) = default;
636  TpedFile &operator=(const TpedFile &in) = default;
638  TpedFile(TpedFile &&in) = default;
640  TpedFile &operator=(TpedFile &&in) = default;
642  ~TpedFile();
643 
645  virtual void open() {};
647  void close();
648 
649  };
650 
656  class TpedFileI : public TpedFile {
657  protected:
664  uint64_t _famLines();
673  uint64_t _famLines(fstream &fam);
681  void _famCopy(fstream &fam);
688  uint64_t _numLines();
689 
690  public:
692  TpedFileI() : TpedFile() {};
697  TpedFileI(const string &stubName) : TpedFile(stubName) {};
699  TpedFileI(const TpedFileI &in) = default;
701  TpedFileI &operator=(const TpedFileI &in) = default;
703  TpedFileI(TpedFileI &&in) = default;
705  TpedFileI &operator=(TpedFileI &&in) = default;
708 
710  void open();
711 
720  void sample(TpedFileO &out, const uint64_t &n);
722  uint64_t nsnp() { return _numLines(); };
724  uint64_t nindiv() { return _famLines(); };
725 
726  };
727 
733  class TpedFileO : TpedFile {
734  friend class TpedFileI;
735  protected:
736 
737  public:
739  TpedFileO() : TpedFile() {};
744  TpedFileO(const string &stubName) : TpedFile(stubName) {};
746  TpedFileO(const TpedFileO &in) = default;
748  TpedFileO &operator=(const TpedFileO &in) = default;
750  TpedFileO(TpedFileO &&in) = default;
752  TpedFileO &operator=(TpedFileO &&in) = default;
755 
757  void open();
758  };
759 
765  class VcfFile : public GtxtFile {
766  protected:
767 
768  public:
770  VcfFile() : GtxtFile() {};
776  VcfFile(const string &fileName) : GtxtFile(fileName) {};
777 
779  VcfFile(const VcfFile &in) = default;
781  VcfFile &operator=(const VcfFile &in) = default;
783  VcfFile(VcfFile &&in) = default;
785  VcfFile &operator=(VcfFile &&in) = default;
787  ~VcfFile(){};
788 
790  void open() {};
792  void close();
793  };
794 
800  class VcfFileI : public VcfFile {
801  protected:
808  uint64_t _numLines();
809 
810  public:
812  VcfFileI() : VcfFile() {};
817  VcfFileI(const string &fileName) : VcfFile(fileName) {};
819  VcfFileI(const VcfFileI &in) = default;
821  VcfFileI &operator=(const VcfFileI &in) = default;
823  VcfFileI(VcfFileI &&in) = default;
825  VcfFileI &operator=(VcfFileI &&in) = default;
828 
830  void open();
831 
840  void sample(VcfFileO &out, const uint64_t &n);
842  uint64_t nsnp() { return _numLines(); };
843 
844  };
845 
851  class VcfFileO : public VcfFile {
852  friend class VcfFileI;
853  protected:
854 
855  public:
857  VcfFileO() : VcfFile() {};
862  VcfFileO(const string &fileName) : VcfFile(fileName) {};
864  VcfFileO(const VcfFileO &in) = default;
866  VcfFileO &operator=(const VcfFileO &in) = default;
868  VcfFileO(VcfFileO &&in) = default;
870  VcfFileO &operator=(VcfFileO &&in) = default;
873 
875  void open();
876 
877  };
878 
884  class HmpFile : public GtxtFile {
885  protected:
886 
887  public:
889  HmpFile() : GtxtFile() {};
895  HmpFile(const string &fileName) : GtxtFile(fileName) {};
896 
898  HmpFile(const HmpFile &in) = default;
900  HmpFile &operator=(const HmpFile &in) = default;
902  HmpFile(HmpFile &&in) = default;
904  HmpFile &operator=(HmpFile &&in) = default;
906  ~HmpFile(){};
907 
909  virtual void open() {};
911  virtual void close();
912 
913  };
914 
920  class HmpFileI : HmpFile {
921  protected:
928  uint64_t _numLines();
929 
930  public:
932  HmpFileI() : HmpFile() {};
937  HmpFileI(const string &fileName);
939  HmpFileI(const HmpFileI &in) = default;
941  HmpFileI &operator=(const HmpFileI &in) = default;
943  HmpFileI(HmpFileI &&in) = default;
945  HmpFileI &operator=(HmpFileI &&in) = default;
948 
950  void open();
951 
960  void sample(HmpFileO &out, const uint64_t &n);
962  uint64_t nsnp() { return _numLines(); };
963 
964  };
965 
971  class HmpFileO : public HmpFile {
972  friend class HmpFileI;
973  protected:
974 
975  public:
977  HmpFileO() : HmpFile() {};
982  HmpFileO(const string &fileName) : HmpFile(fileName) {};
984  HmpFileO(const HmpFileO &in) = default;
986  HmpFileO &operator=(const HmpFileO &in) = default;
988  HmpFileO(HmpFileO &&in) = default;
990  HmpFileO &operator=(HmpFileO &&in) = default;
993 
995  void open();
996 
997  };
998 
999 
1000 }
1001 
1002 #endif /* varfiles_hpp */
1003 
1004 
1005 
1006 
~BedFileI()
Destructor.
Definition: varfiles.hpp:387
Definition: populations.hpp:39
VcfFile()
Default constructor.
Definition: varfiles.hpp:770
virtual void open()
Open stream (does nothing)
Definition: varfiles.hpp:300
HmpFile(const string &fileName)
Constructor with file name.
Definition: varfiles.hpp:895
TPED file base class.
Definition: varfiles.hpp:618
size_t _nCols
Number of elements in a row.
Definition: varfiles.hpp:128
uint64_t nsnp()
Number of SNPs in the object.
Definition: varfiles.hpp:962
VarFile()
Default constructor (protected)
Definition: varfiles.hpp:98
GtxtFile(const string &fileName, const bool &head)
Constructor with file name and header indicator.
Definition: varfiles.hpp:485
Connect lines with populations.
fstream _varFile
Variant file stream.
Definition: varfiles.hpp:95
TPED file output class.
Definition: varfiles.hpp:733
TpedFile(const string &stubName)
File name constructor.
Definition: varfiles.hpp:632
virtual void open()
Open stream (does nothing)
Definition: varfiles.hpp:909
~TpedFileO()
Destructor.
Definition: varfiles.hpp:754
GtxtFileO()
Default constructor.
Definition: varfiles.hpp:585
HmpFileI()
Default constructor.
Definition: varfiles.hpp:932
fstream _tfamFile
Corresponding .tfam file stream.
Definition: varfiles.hpp:621
GtxtFileO(const string &fileName, const bool &head)
File name constructor with header specification.
Definition: varfiles.hpp:596
~HmpFileO()
Destructor.
Definition: varfiles.hpp:992
VCF file output class.
Definition: varfiles.hpp:851
const double EPS
Machine epsilon.
Definition: varfiles.hpp:84
Binary file input class.
Definition: varfiles.hpp:169
~GtxtFileI()
Destructor.
Definition: varfiles.hpp:543
static const unordered_map< char, string > _tests
Genotype bit tests.
Definition: varfiles.hpp:277
HmpFile()
Default constructor.
Definition: varfiles.hpp:889
~VarFile()
Destructor.
Definition: varfiles.hpp:110
string _fileName
File name.
Definition: varfiles.hpp:466
~BedFileO()
Destructor.
Definition: varfiles.hpp:452
fstream _bimFile
Corresponding .bim file stream.
Definition: varfiles.hpp:263
TpedFileO()
Default constructor.
Definition: varfiles.hpp:739
Generic binary file base class.
Definition: varfiles.hpp:123
GtxtFileO(const string &fileName)
File name constructor.
Definition: varfiles.hpp:590
Text file input class.
Definition: varfiles.hpp:510
BedFileI()
Default constructor.
Definition: varfiles.hpp:372
uint64_t nindiv()
Number of individuals in the object.
Definition: varfiles.hpp:422
VCF file base class.
Definition: varfiles.hpp:765
GtxtFile()
Default constructor.
Definition: varfiles.hpp:472
static const vector< char > _masks
Genotype bit masks.
Definition: varfiles.hpp:271
VCF file input class.
Definition: varfiles.hpp:800
TpedFileO(const string &stubName)
File name constructor.
Definition: varfiles.hpp:744
virtual void close()=0
Close stream.
Generic text file base class.
Definition: varfiles.hpp:463
BedFileO(const string &stubName)
File name constructor.
Definition: varfiles.hpp:442
~HmpFileI()
Destructor.
Definition: varfiles.hpp:947
VarFile & operator=(const VarFile &in)=default
Copy assignment.
string _fileStub
File name stub (minus the extension)
Definition: varfiles.hpp:265
~GtxtFile()
Destructor.
Definition: varfiles.hpp:496
BED file output class.
Definition: varfiles.hpp:431
Hapmap (HMP) file base class.
Definition: varfiles.hpp:884
virtual void open()
Open stream (does nothing)
Definition: varfiles.hpp:499
GbinFile(const string &fileName, const size_t &nCols, const size_t &elemSize)
Constructor with file name.
Definition: varfiles.hpp:144
uint64_t nsnp()
Number of SNPs in the object.
Definition: varfiles.hpp:722
~VcfFileO()
Destructor.
Definition: varfiles.hpp:872
VcfFileO(const string &fileName)
File name constructor.
Definition: varfiles.hpp:862
~VcfFile()
Destructor.
Definition: varfiles.hpp:787
uint64_t nlines()
Number of lines in the object.
Definition: varfiles.hpp:570
VcfFile(const string &fileName)
Constructor with file name.
Definition: varfiles.hpp:776
Base variant file class.
Definition: varfiles.hpp:92
string _fileStub
File name stub (minus the extension)
Definition: varfiles.hpp:623
string _fileName
File name.
Definition: varfiles.hpp:126
bool _head
Is there a header?
Definition: varfiles.hpp:468
BED file input class.
Definition: varfiles.hpp:311
uint64_t nlines()
Number of rows in the object.
Definition: varfiles.hpp:214
Generic binary file output class.
Definition: varfiles.hpp:223
GtxtFileI()
Default constructor.
Definition: varfiles.hpp:522
Population index.
Definition: populations.hpp:44
~TpedFileI()
Destructor.
Definition: varfiles.hpp:707
const double PI
pi
Definition: varfiles.hpp:86
TpedFileI()
Default constructor.
Definition: varfiles.hpp:692
GtxtFile(const string &fileName)
Constructor with file name.
Definition: varfiles.hpp:478
GbinFile()
Default constructor.
Definition: varfiles.hpp:134
~VcfFileI()
Destructor.
Definition: varfiles.hpp:827
~GbinFileI()
Destructor.
Definition: varfiles.hpp:199
~GtxtFileO()
Destructor.
Definition: varfiles.hpp:606
VcfFileI(const string &fileName)
File name constructor.
Definition: varfiles.hpp:817
BedFileI(const string &stubName)
File name constructor.
Definition: varfiles.hpp:377
Generic text file output class.
Definition: varfiles.hpp:579
TPED file input class.
Definition: varfiles.hpp:656
fstream _famFile
Corresponding .fam file stream.
Definition: varfiles.hpp:261
HMP file output class.
Definition: varfiles.hpp:971
void open()
Open stream (does nothing)
Definition: varfiles.hpp:790
uint64_t nindiv()
Number of individuals in the object.
Definition: varfiles.hpp:724
virtual void open()=0
Open stream.
virtual void open()
Open stream (does nothing)
Definition: varfiles.hpp:645
TpedFileI(const string &stubName)
File name constructor.
Definition: varfiles.hpp:697
HmpFileO(const string &fileName)
File name constructor.
Definition: varfiles.hpp:982
VcfFileI()
Default constructor.
Definition: varfiles.hpp:812
virtual void open()
Open stream (does nothing)
Definition: varfiles.hpp:158
TpedFile()
Default constructor.
Definition: varfiles.hpp:627
uint64_t nsnp()
Number of SNPs in the object.
Definition: varfiles.hpp:420
VcfFileO()
Default constructor.
Definition: varfiles.hpp:857
size_t _elemSize
Size of each element in bytes.
Definition: varfiles.hpp:130
GtxtFileI(const string &fileName, const bool &head)
File name constructor with header specification.
Definition: varfiles.hpp:533
HmpFileO()
Default constructor.
Definition: varfiles.hpp:977
GtxtFileI(const string &fileName)
File name constructor.
Definition: varfiles.hpp:527
BED file base class.
Definition: varfiles.hpp:257
~GbinFileO()
Destructor.
Definition: varfiles.hpp:247
GbinFileI(const string &fileName, const size_t &nCols, const size_t &elemSize)
File name constructor.
Definition: varfiles.hpp:189
GbinFileO(const string &fileName, const size_t &nCols, const size_t &elemSize)
File name constructor.
Definition: varfiles.hpp:237
BedFileO()
Default constructor.
Definition: varfiles.hpp:437
~HmpFile()
Destructor.
Definition: varfiles.hpp:906
HMP file input class.
Definition: varfiles.hpp:920
~GbinFile()
Destructor.
Definition: varfiles.hpp:155
GbinFileI()
Default constructor.
Definition: varfiles.hpp:181
GbinFileO()
Default constructor.
Definition: varfiles.hpp:229
uint64_t nsnp()
Number of SNPs in the object.
Definition: varfiles.hpp:842