summaryrefslogtreecommitdiff
blob: 30a182f8f43ad410224664b7208130ee53e37550 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
--- a/doc/alphabet-format.html
+++ b/doc/alphabet-format.html
@@ -233,7 +233,7 @@
           providing a reference on the meaning of the symbols used. If present, the
           symbol name must be the second field.</p>
           <p>The &quot;<span class="pdat">name</span>&quot; follows the rules of
-          <a href="qstr">quoted text</a>.</p>
+          <a href="#qstr">quoted text</a>.</p>
         </div>
         <h5>color</h5>
         <div class="indent">
--- a/doc/release-notes.html
+++ b/doc/release-notes.html
@@ -14,8 +14,26 @@
       <h2>Motif-based sequence analysis tools</h2>
     </div>
     <h2>MEME Suite Release Notes</h2>
+    <hr>
+      <b>MEME version 4.11.2 patch 1 -- June 16, 2016</b>
+      <ul>
+        <li>
+            <b>Bug fixes</b>
+            <ul>
+              <li>
+              Fixed bug in MCAST 4.11.2 that caused it to prematurely truncate
+              reading the sequence file.
+              </li>
+              <li>
+              Modified MEME to fall back to a simple Dirichlet prior when
+              using DNA or a custom alphabet with a prior that requires
+              a prior library, but no prior libray is specified.
+              </li>
+            </ul
+        </li>
+      </ul>
+      <p>
       <hr>
-      <p>
         <b>MEME version 4.11.2 -- May 5 2016</b>
       </p>
       <ul>
--- a/src/fasta-io.c
+++ b/src/fasta-io.c
@@ -14,6 +14,7 @@
 #include "alphabet.h"
 #include "fasta-io.h"
 #include "io.h"
+#include "seq-reader-from-fasta.h"
 #include "prior-reader-from-psp.h"
 #include "seq.h"
 
@@ -159,61 +160,6 @@
 }
 
 /****************************************************************************
- * Read raw sequence until a new sequence is encountered or too many letters
- * are read.  The new sequence is appended to the end of the given
- * sequence.
- *
- * Return: Was the sequence read completely?
- ****************************************************************************/
-static BOOLEAN_T read_raw_sequence_from_reader(
-   DATA_BLOCK_READER_T *fasta_reader, // Sequence source
-   char* name, // Sequence ID (used in error messages).
-   ALPH_T* alph, // Alphabet in use
-   unsigned int offset, // Current position in raw_sequence.
-   unsigned int max_chars, // Maximum chars in raw_sequence.
-   char* raw_sequence // Pre-allocated sequence.
-) {
-  // tlb; change a_char to integer so it will compile on SGI
-  int a_char;
-  int start_update;
-  BOOLEAN_T return_value = TRUE;
-
-  // Start at the end of the given sequence.
-  assert(offset < max_chars);
-
-  DATA_BLOCK_T *seq_block = new_sequence_block(max_chars - offset);
-  return_value = !fasta_reader->get_next_block(fasta_reader, seq_block);
-
-  char *seq_buffer = get_sequence_from_data_block(seq_block);
-  size_t seq_buffer_size = get_num_read_into_data_block(seq_block);
-  int i;
-  for (i = 0; i < seq_buffer_size; ++i) {
-    a_char = seq_buffer[i];
-    // Skip non-alphabetic characters.
-    if (!isalnum(a_char) && a_char != '-' && a_char != '*' && a_char != '.') {
-      if ((a_char != ' ') && (a_char != '\t') && (a_char != '\n') && (a_char != '\r')) {
-        fprintf(stderr, "Warning: Skipping character %c in sequence %s.\n",
-                a_char, name);
-      }
-    } else {
-      // skip check if unknown alph
-      if (alph != NULL && !alph_is_known(alph, a_char)) {
-        fprintf(stderr, "Warning: Converting illegal character %c to %c ",
-                a_char, alph_wildcard(alph));
-        fprintf(stderr, "in sequence %s.\n", name);
-        a_char = alph_wildcard(alph);
-      }
-      raw_sequence[offset] = (char) a_char;
-      ++offset;
-    }
-  }
-
-  raw_sequence[offset] = '\0';
-  free_data_block(seq_block);
-  return(return_value);
-}
-
-/****************************************************************************
  * Read one sequence from a file in Fasta format.
  *
  * Return: Was a sequence successfully read?
@@ -320,44 +266,6 @@
 }
 
 /****************************************************************************
- * Read up to max_chars letters of one sequence from a DATA_BLOCK_T readder
- * and copy them in to the raw sequence in the SEQ_T object starting at the
- * given buffer offset. 
- ****************************************************************************/
-void read_one_fasta_segment_from_reader(
-   DATA_BLOCK_READER_T *fasta_reader,
-   size_t max_size,
-   size_t buffer_offset,
-   SEQ_T *sequence
-) {
-
-  assert(sequence != NULL);
-  assert(get_seq_length(sequence) <= max_size);
-
-  // Get the raw sequence buffer from the SEQ_T
-  char *raw_sequence = get_raw_sequence(sequence);
-  if (raw_sequence == NULL) {
-    // Allocate space for raw sequence if not done yet.
-    raw_sequence = mm_malloc(sizeof(char) * max_size + 1);
-    raw_sequence[0] = 0;
-  }
-
-  // Read a block of sequence charaters into the
-  // raw sequence buffer for the SEQ_T.
-  char *name = get_seq_name(sequence);
-  BOOLEAN_T is_complete = read_raw_sequence_from_reader(
-    fasta_reader,
-    name,
-    NULL, //FIXME this is dodgy, need a proper way of getting the alphabet. The fasta_reader has it but it is not accessable!
-    buffer_offset,
-    max_size,
-    raw_sequence
-  );
-  set_raw_sequence(raw_sequence, is_complete, sequence);
-
-}
-
-/****************************************************************************
  * Read all the sequences from a FASTA file at once.
    Multiple files can be appended by calling this more than once.
  ****************************************************************************/
--- a/src/fasta-io.h
+++ b/src/fasta-io.h
@@ -43,19 +43,6 @@
 );
 
 /****************************************************************************
- * Read up to max_chars letters of one sequence from a DATA_BLOCK_T readder
- * and copy them in to the raw sequence in the SEQ_T object starting at the
- * given buffer offset. 
- ****************************************************************************/
-void read_one_fasta_segment_from_reader(
-  DATA_BLOCK_READER_T *fasta_reader,
-  size_t max_size,
-  size_t buffer_offset,
-  SEQ_T* sequence
-);
-
-
-/****************************************************************************
  * Read all the sequences from a file in Fasta format.
  ****************************************************************************/
 void read_many_fastas
--- a/src/init.c
+++ b/src/init.c
@@ -767,10 +767,16 @@
       if (alph_is_builtin_protein(alph)) { // default mixture prior for proteins
         plib_name = make_path_to_file(get_meme_etc_dir(), PROTEIN_PLIB);
       } else {
-        fprintf(stderr, "The prior library must be specified for DNA or custom "
-            "alphabets when specifiying a prior type of 'dmix', 'mega' "
-            "or 'megap'.");
-        exit(1);
+        fprintf(
+          stderr, 
+          "WARNING: When using DNA or a custom alphabet, "
+          "and specifiying a prior type of\n"
+          "'dmix', 'mega' or 'megap', a prior library must be provided.\n"
+          "No prior library was provided, so a simple Dirichlet prior will be used.\n"
+        );
+        prior = "dirichlet";
+        ptype = Dirichlet;
+        if (beta <= 0) beta = 0.01; // default b = 0.01 for simple Dirichlet
       }
     }
   }
--- a/src/seq-reader-from-fasta.c
+++ b/src/seq-reader-from-fasta.c
@@ -639,11 +639,140 @@
   return fasta_reader->current_position;
 }
 
+
+/****************************************************************************
+ * Read up to max_chars letters of one sequence from a DATA_BLOCK_T readder
+ * and copy them in to the raw sequence in the SEQ_T object starting at the
+ * given buffer offset. 
+ ****************************************************************************/
+void read_one_fasta_segment_from_reader(
+   DATA_BLOCK_READER_T *fasta_reader,
+   size_t max_size,
+   size_t offset,
+   SEQ_T *sequence
+) {
+
+
+  assert(sequence != NULL);
+  assert(offset < max_size);
+
+  // Get the raw sequence buffer from the SEQ_T
+  char *raw_sequence = get_raw_sequence(sequence);
+  if (raw_sequence == NULL) {
+    // Allocate space for raw sequence if not done yet.
+    raw_sequence = mm_malloc(sizeof(char) * max_size + 1);
+    raw_sequence[0] = 0;
+  }
+
+  // Read a block of sequence charaters into the
+  // raw sequence buffer for the SEQ_T, starting at offset.
+  BOOLEAN_T is_complete = read_raw_sequence_from_reader(
+    fasta_reader,
+    max_size - offset,
+    raw_sequence + offset
+  );
+  set_raw_sequence(raw_sequence, is_complete, sequence);
+}
+
+/****************************************************************************
+ * Read raw sequence until a new sequence is encountered or too many letters
+ * are read.
+ *
+ * Return: Was the sequence read completely?
+ ****************************************************************************/
+BOOLEAN_T read_raw_sequence_from_reader(
+   DATA_BLOCK_READER_T *reader, // Sequence source
+   unsigned int max_chars, // Maximum chars in raw_sequence.
+   char* raw_sequence // Pre-allocated sequence buffer.
+) {
+
+  SEQ_READER_FROM_FASTA_T *fasta_reader 
+    = (SEQ_READER_FROM_FASTA_T *) get_data_block_reader_data(reader);
+
+  // Read sequence into temp. buffer from the sequence file.
+  char buffer[max_chars];
+  long start_file_pos = ftell(fasta_reader->fasta_file);
+  size_t seq_index = 0;
+  size_t total_read = 0;
+  while (seq_index < max_chars) {
+
+    size_t num_char_read = fread(
+      buffer,
+      sizeof(char), 
+      max_chars - seq_index,
+      fasta_reader->fasta_file
+    );
+    fasta_reader->current_position += num_char_read;
+    total_read += num_char_read;
+
+    if (feof(fasta_reader->fasta_file)) {
+       fasta_reader->at_end_of_file = TRUE;
+    }
+    else if (num_char_read < (max_chars - seq_index)) {
+      die(
+        "Error while reading sequence from file:%s.\nError message: %s\n", 
+        fasta_reader->filename,
+        strerror(ferror(fasta_reader->fasta_file))
+      );
+    }
+
+    size_t i;
+    for(i = 0; i < num_char_read; ++i) {
+      char c = buffer[i];
+      assert(c != 0);
+      if (isspace(c)) {
+        // Skip over white space
+        fasta_reader->at_start_of_line = (c == '\n');
+      }
+      else if (c == '>' && fasta_reader->at_start_of_line == TRUE) {
+        // We found the start of a new sequence while trying
+        // to fill the buffer. Leave the buffer incomplete.
+        // and wind back the file
+        fseek(fasta_reader->fasta_file, start_file_pos + i - 1, SEEK_SET);
+        fasta_reader->current_position = start_file_pos + i - 1;
+        fasta_reader->at_end_of_seq = TRUE;
+        fasta_reader->at_start_of_line = FALSE;
+        fasta_reader->at_end_of_file = FALSE;
+        break;
+      }
+      else {
+        fasta_reader->at_start_of_line = FALSE;
+        // Check that character is legal in alphabet. 
+        // If not, replace with wild card character.
+        if (alph_is_known(fasta_reader->alphabet, c)) {
+          raw_sequence[seq_index] = c;
+        }
+        else {
+          raw_sequence[seq_index] = alph_wildcard(fasta_reader->alphabet);
+          fprintf(
+            stderr, 
+            "Warning: %c is not a valid character in %s alphabet.\n"
+            "         Converting %c to %c.\n",
+            c,
+            alph_name(fasta_reader->alphabet),
+            c,
+            raw_sequence[i]
+          );
+        }
+        ++seq_index;
+      }
+    }
+    if (fasta_reader->at_end_of_seq | fasta_reader->at_end_of_file) {
+      break;
+    }
+  }
+
+  raw_sequence[seq_index] = '\0';
+  return(fasta_reader->at_end_of_seq | fasta_reader->at_end_of_file);
+}
+
 /******************************************************************************
- * Fills in the next data block for the sequence. 
- * During the first call for the sequence it fills in the full data block.
- * On successive calls, shifts the sequence in the block down one position
- * and reads one more character.
+ * Populates the data block for the with the next block of sequence. 
+ *
+ * During the first call for the sequence it fills in a buffer from a file,
+ * The sequence pointer in the data block is set to point at the start of the buffer.
+ * On successive calls, the sequence pointer in the block is shifted down one position
+ * in the buffer. When the end of the buffer is reached, it is filled again from the file.
  * 
  * Returns TRUE if it was able to completely fill the block, FALSE if 
  * the next sequence or EOF was reached before the block was filled.
--- a/src/seq-reader-from-fasta.h
+++ b/src/seq-reader-from-fasta.h
@@ -37,5 +37,30 @@
   int * end_ptr           // end position of sequence (chr:\d+-(\d+))
 );
 
+/****************************************************************************
+ * Read raw sequence until a new sequence is encountered or too many letters
+ * are read.
+ *
+ * Return: Was the sequence read completely?
+ ****************************************************************************/
+BOOLEAN_T read_raw_sequence_from_reader(
+   DATA_BLOCK_READER_T *fasta_reader, // Sequence source
+   unsigned int max_chars, // Maximum chars in raw_sequence.
+   char* raw_sequence // Pre-allocated sequence.
+);
+
+/****************************************************************************
+ * Read up to max_chars letters of one sequence from a DATA_BLOCK_T readder
+ * and copy them in to the raw sequence in the SEQ_T object starting at the
+ * given buffer offset. 
+ ****************************************************************************/
+void read_one_fasta_segment_from_reader(
+   DATA_BLOCK_READER_T *reader,
+   size_t max_size,
+   size_t offset,
+   SEQ_T *sequence
+);
+
+
 size_t get_current_pos_from_seq_reader_from_fasta(DATA_BLOCK_READER_T *reader);
 #endif