-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
363 lines (318 loc) · 8.06 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
package main
import (
"bufio"
"encoding/binary"
"errors"
"flag"
"fmt"
"log"
"os"
"time"
)
// programVersion is the version string embedded in the "Version:" line of
// the header produced by createHeader.
const programVersion = "0.0.1"
// compressOptions bundles the encoding parameters chosen on the command line.
// The field order matters: main builds this struct with a positional literal.
type compressOptions struct {
	// bitsPerBase is the number of bits used per encoded base (2 or 3).
	bitsPerBase int
	// bitsPerQual is the number of bits used per encoded quality value.
	bitsPerQual int
	// bytesPerID is the fixed width of a stored read ID; values <= 0 mean
	// that read IDs are not stored at all.
	bytesPerID int
}
// fastqRead is one FASTQ record after its text lines have been turned into
// integer codes.
type fastqRead struct {
	// readID is the identifier line of the record, stored verbatim.
	readID string
	// seq holds one integer code per base (see twoBitDNA / threeBitDNA).
	seq []int
	// qualities holds one integer per quality character: either the raw
	// offset-corrected score or a block index (see decodeQualitryString).
	qualities []int
}
// compressedSeq bit-packs the read's base codes, spending bitsPerBase bits on
// each base, and returns the packed bytes.
func (r fastqRead) compressedSeq(bitsPerBase int) []byte {
	packed := compressIntSlice(r.seq, bitsPerBase)
	return packed
}
// compressedQual bit-packs the read's quality values, spending bitsPerQual
// bits on each value, and returns the packed bytes.
func (r fastqRead) compressedQual(bitsPerQual int) []byte {
	packed := compressIntSlice(r.qualities, bitsPerQual)
	return packed
}
// byteID returns the read ID as a slice of exactly capacity bytes: the ID
// bytes followed by zero bytes up to the requested width.
// It returns an error when the ID does not fit into capacity bytes.
func (r fastqRead) byteID(capacity int) ([]byte, error) {
	if len(r.readID) > capacity {
		return nil, errors.New("Read ID is too large")
	}
	// make zero-fills the buffer, so everything after the copied ID is
	// already the required padding.
	padded := make([]byte, capacity)
	copy(padded, r.readID)
	return padded, nil
}
// threeBitDNA maps a DNA base to its three-bit code:
// A=0, T=1, C=2, G=3. Every other symbol (including lowercase letters)
// maps to 4.
func threeBitDNA(base rune) int {
	switch base {
	case 'A':
		return 0
	case 'T':
		return 1
	case 'C':
		return 2
	case 'G':
		return 3
	}
	// Anything outside [ACTG] gets the dedicated "unknown" code.
	return 4
}
// twoBitDNA maps a DNA base to its two-bit code: A=0, T=1, C=2, G=3.
// Two bits leave no room for an "unknown" code, so every symbol outside
// [ACTG] is squashed onto G (3).
func twoBitDNA(base rune) int {
	switch base {
	case 'A':
		return 0
	case 'T':
		return 1
	case 'C':
		return 2
	}
	// 'G' and every unrecognised symbol share code 3.
	return 3
}
// blockQual collapses a quality score into one of five block indices:
//
//	qual <  2  -> 0
//	qual < 26  -> 1 (decodes to 2)
//	qual < 31  -> 2 (decodes to 26)
//	qual < 41  -> 3 (decodes to 31)
//	qual >= 41 -> 4 (decodes to 41)
func blockQual(qual int) int {
	// Exclusive upper bounds of blocks 0..3, in ascending order.
	upperBounds := [...]int{2, 26, 31, 41}
	for block, bound := range upperBounds {
		if qual < bound {
			return block
		}
	}
	// Scores at or above the last bound fall into the top block.
	return len(upperBounds)
}
// compressIntSlice bit-packs s into bytes. Each item is truncated to a uint8
// and its lowest bitsPerItem bits are written most-significant bit first,
// items following each other without gaps. The final byte is padded with
// zero bits on the right so that no item bits are lost.
//
// bitsPerItem must be between 1 and 8; otherwise the program is terminated
// via log.Fatalln (only when s is non-empty, since there is nothing to encode
// for an empty slice). An empty input yields an empty, non-nil slice.
func compressIntSlice(s []int, bitsPerItem int) []byte {
	if len(s) == 0 {
		return []byte{}
	}
	if bitsPerItem < 1 || bitsPerItem > 8 {
		log.Fatalln("Could not compress seq")
	}

	// Round the bit count up to whole bytes; the unused low bits of the
	// last byte stay zero and act as padding.
	totalBits := len(s) * bitsPerItem
	compressed := make([]byte, (totalBits+7)/8)

	bitPos := 0 // index of the next output bit, counted from the MSB of byte 0
	for _, item := range s {
		value := uint8(item)
		for shift := bitsPerItem - 1; shift >= 0; shift-- {
			if (value>>uint(shift))&1 == 1 {
				compressed[bitPos/8] |= byte(0x80) >> uint(bitPos%8)
			}
			bitPos++
		}
	}
	return compressed
}
// reverseSliceB reverses s in place and returns the same slice, so the
// caller's backing array is modified.
func reverseSliceB(s []bool) []bool {
	last := len(s) - 1
	for left := 0; left < len(s)/2; left++ {
		right := last - left
		s[left], s[right] = s[right], s[left]
	}
	return s
}
// boolSliceToByte assembles a byte from exactly eight booleans, where s[0] is
// the most significant bit and s[7] the least significant one.
// It returns an error when s does not contain exactly eight elements.
// The input slice is only read, never modified.
func boolSliceToByte(s []bool) (byte, error) {
	if len(s) != 8 {
		return byte(0), errors.New("Slice must have length 8")
	}
	var result uint8
	for idx, el := range s {
		if el {
			// Element idx sits idx positions below the top bit.
			result |= uint8(0x80) >> uint(idx)
		}
	}
	return result, nil
}
// uint8ToBoolSlice expands the lowest capacity bits of b into booleans,
// most significant of those bits first. capacity is both the length of the
// result and the number of bits taken; it must lie between 1 and 8, otherwise
// an error is returned.
func uint8ToBoolSlice(b uint8, capacity int) ([]bool, error) {
	if capacity < 1 || capacity > 8 {
		return nil, errors.New("Capacity must be between 1 and 8")
	}
	bits := make([]bool, capacity)
	for pos := range bits {
		// Position 0 holds bit (capacity-1), the last position holds bit 0.
		shift := uint8(capacity - 1 - pos)
		bits[pos] = (b>>shift)&1 == 1
	}
	return bits, nil
}
// decodeQualitryString converts the quality line of a FASTQ read into
// integers. Each character's code point is reduced by the offset of 33; when
// blockQuals is set the resulting score is further collapsed into its block
// index via blockQual. The result has one slot per byte of s.
func decodeQualitryString(s string, blockQuals bool) []int {
	decoded := make([]int, len(s))
	for pos, symbol := range s {
		score := int(symbol) - 33 // qualities are offset by 33
		if blockQuals {
			score = blockQual(score)
		}
		decoded[pos] = score
	}
	return decoded
}
// seqStringToInts encodes a DNA sequence string as a slice of integer base
// codes, using twoBitDNA when bitsPerBase is 2 and threeBitDNA when it is 3.
// Any other value of bitsPerBase yields an error.
func seqStringToInts(s string, bitsPerBase int) ([]int, error) {
	// Pick the encoder once instead of re-deciding for every base.
	var encode func(rune) int
	switch bitsPerBase {
	case 2:
		encode = twoBitDNA
	case 3:
		encode = threeBitDNA
	default:
		return nil, errors.New("Must use 2 or 3 bits per base")
	}

	codes := make([]int, len(s))
	for pos, base := range s {
		codes[pos] = encode(base)
	}
	return codes, nil
}
// fastqReadFromBucket builds a fastqRead from a bucket of exactly three
// strings: the read ID, the sequence and the quality string, in that order.
// The sequence is encoded with opts.bitsPerBase bits per base. Qualities are
// kept as plain scores when opts.bitsPerQual is 6 and collapsed into blocks
// for every other value.
// It returns an error when the bucket has the wrong size or the sequence
// cannot be encoded.
func fastqReadFromBucket(bucket []string, opts compressOptions) (fastqRead, error) {
	if len(bucket) != 3 {
		return fastqRead{}, errors.New("Read must consist of 3 strings")
	}

	bases, err := seqStringToInts(bucket[1], opts.bitsPerBase)
	if err != nil {
		return fastqRead{}, err
	}

	// Only the 6-bit mode stores unmodified quality scores.
	useBlocks := opts.bitsPerQual != 6

	return fastqRead{
		readID:    bucket[0],
		seq:       bases,
		qualities: decodeQualitryString(bucket[2], useBlocks),
	}, nil
}
// compressFastqBucket turns one read (ID, sequence, qualities) into its
// binary record: the optional fixed-width ID, followed by the packed
// sequence, followed by the packed qualities.
//
// The program is terminated via the log package when the bucket cannot be
// parsed or when the read ID does not fit into opts.bytesPerID bytes.
// Continuing without the ID would emit a record with a missing ID field and
// silently break the fixed-width layout of the output.
func compressFastqBucket(bucket []string, opts compressOptions) []byte {
	read, err := fastqReadFromBucket(bucket, opts)
	if err != nil {
		log.Fatalln(err)
	}

	var readAsBytes []byte
	// IDs do not get stored at all when bytesPerID is zero
	if opts.bytesPerID > 0 {
		readID, err := read.byteID(opts.bytesPerID)
		if err != nil {
			log.Fatalf("read %q: %v (limit is %d bytes)\n", read.readID, err, opts.bytesPerID)
		}
		readAsBytes = append(readAsBytes, readID...)
	}
	readAsBytes = append(readAsBytes, read.compressedSeq(opts.bitsPerBase)...)
	readAsBytes = append(readAsBytes, read.compressedQual(opts.bitsPerQual)...)
	return readAsBytes
}
// utcTime returns the current time, expressed in UTC, in the default
// time.Time string format.
func utcTime() string {
	now := time.Now().UTC()
	return now.String()
}
// createHeader builds the plain-text file header and pads it with zero bytes
// to exactly capacity bytes. The text records the program name and version,
// the mode, the encoding widths, the capacities and the creation date (UTC).
// The "Header:" capacity line reports the capacity argument itself, so the
// header stays truthful when called with a size other than 4096.
//
// It returns an error when the text does not fit into capacity bytes.
//
// NOTE(review): the mode ("LOSSLESS"), the quality width ("6 bit") and the
// read ID width ("64 bytes") are fixed strings here and do not follow the
// command line options, because those values are not passed to this function.
func createHeader(capacity int, bitsPerBase int) ([]byte, error) {
	header := fmt.Sprintf(
		"Program: fastqube\nVersion: %s\n"+
			"Mode: LOSSLESS\n"+
			"Encoding:\n\tSequence: %d bit\n\tQualities: 6 bit\n"+
			"Capacities:\n\tHeader: %d bytes\n\tRead IDs: 64 bytes\n"+
			"Date: %s\n",
		programVersion, bitsPerBase, capacity, utcTime())

	if len(header) > capacity {
		return nil, errors.New("Header too long")
	}

	// make zero-fills the buffer, which provides the trailing padding.
	byteHeader := make([]byte, capacity)
	copy(byteHeader, header)
	return byteHeader, nil
}
// compressPath reads the FASTQ file at path and writes its compressed form to
// standard output: first the 4096-byte header, then one binary record per
// read.
//
// Records are taken as consecutive groups of four lines (ID, sequence,
// separator, qualities). The separator is skipped by position rather than by
// content, so "+ID" separators are handled and a quality line that happens to
// be "+" is not mistaken for a separator. A trailing group of fewer than four
// lines is ignored.
//
// Output goes through a buffered writer that is flushed at the end. Any I/O
// or encoding failure terminates the program via log.Fatalln.
func compressPath(path string, opts compressOptions) {
	file, err := os.Open(path)
	if err != nil {
		log.Fatalln(err)
	}
	defer file.Close()

	header, err := createHeader(4096, opts.bitsPerBase)
	if err != nil {
		log.Fatalln(err)
	}

	// Buffer stdout so that each read does not cost a separate write call.
	out := bufio.NewWriter(os.Stdout)
	if err := binary.Write(out, binary.BigEndian, header); err != nil {
		log.Fatalln(err)
	}

	const (
		linesPerRecord = 4
		separatorIndex = 2 // zero-based position of the "+" line in a record
	)
	bucket := make([]string, 0, 3) // ID, sequence and qualities of one read
	lineInRecord := 0

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		if lineInRecord != separatorIndex {
			bucket = append(bucket, scanner.Text())
		}
		lineInRecord++

		if lineInRecord == linesPerRecord {
			compressed := compressFastqBucket(bucket, opts)
			if err := binary.Write(out, binary.BigEndian, compressed); err != nil {
				log.Fatalln(err)
			}
			bucket = bucket[:0]
			lineInRecord = 0
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatalln(err)
	}
	if err := out.Flush(); err != nil {
		log.Fatalln(err)
	}
}
// main parses the command line and compresses the given FASTQ file to
// standard output.
//
// Flags:
//
//	-c  compress (the default when neither -c nor -d is given)
//	-d  decompress (recognised, but not implemented: the program exits with
//	    an error instead of silently doing nothing)
//	-2  use two bits per base instead of three
//	-b  block qualities (three bits per quality instead of six)
//	-B  number of bytes reserved per read ID (default 64)
//
// The first positional argument is the input file.
func main() {
	var (
		compress       bool
		decompress     bool
		twoBitEncoding bool
		blockQualities bool
		bytesPerID     int
	)
	flag.BoolVar(&compress, "c", false, "Compress")
	flag.BoolVar(&decompress, "d", false, "Decompress")
	flag.BoolVar(&twoBitEncoding, "2", false, "2Bit-encoding")
	flag.BoolVar(&blockQualities, "b", false, "Block Qualities")
	flag.IntVar(&bytesPerID, "B", 64, "Bytes per ID")
	flag.Parse()

	tail := flag.Args()
	if len(tail) == 0 {
		log.Fatalln("No input file specified")
	}
	filePath := tail[0]

	if compress && decompress {
		log.Fatalln("Cannot set both -c and -d")
	}
	if decompress {
		// There is no decompression code path; fail loudly rather than
		// exiting successfully without producing any output.
		log.Fatalln("Decompression is not implemented")
	}
	// From here on the program compresses: either -c was given explicitly or
	// no mode flag was set, in which case compression is the default.

	bitsPerBase := 3
	if twoBitEncoding {
		bitsPerBase = 2
	}
	bitsPerQual := 6
	if blockQualities {
		bitsPerQual = 3
	}

	opts := compressOptions{bitsPerBase, bitsPerQual, bytesPerID}
	compressPath(filePath, opts)
}