#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Size in bytes of each of the three working buffers (input, current output
// line, previous output line). Parenthesised so it expands safely anywhere.
#define BUFFER_CAPACITY (1024UL * 1024UL * 25UL)
// Number of '|'-separated fields a record must have to be written out.
#define INDICES_EXPECTED 32
// Maximum number of field start offsets remembered per record.
#define INDICES_MAXIMUM 40
// Number of fields copied to the output for each accepted record.
#define INDICES_WRITE 16
// A progress message is printed every 2^PRINT_FREQUENCY written lines.
#define PRINT_FREQUENCY 16
#define PRINT_MASK ((1UL << PRINT_FREQUENCY) - 1)

// Assemble one output line into `output`: the INDICES_WRITE fields selected by
// `indicesWrite`, joined with '|' and terminated by a 0 byte.
//
// `indicesArray[k]` is the offset in `input` of the first byte of field k, and
// `indicesArray[k + 1] - 1` is the offset of the separator that ends it.
// Returns the number of bytes stored, including the trailing 0 byte.
static unsigned long buildOutputLine (unsigned char* output, const unsigned char* input, const unsigned long* indicesArray, const unsigned long* indicesWrite)
{
	unsigned long size = 0;
	unsigned long j;

	for (j = 0; j < INDICES_WRITE; ++j)
	{
		unsigned long start = indicesArray[indicesWrite[j]];
		unsigned long block = indicesArray[indicesWrite[j] + 1] - start - 1;

		memcpy (output + size, input + start, block);

		size += block;
		output[size++] = (j + 1 < INDICES_WRITE) ? '|' : 0;
	}

	return size;
}

// Read '|'-separated records from argv[1] and write a reduced version of each
// to argv[2].
//
// Any byte below ' ' ends a record. A record with exactly INDICES_EXPECTED
// fields is reduced to the INDICES_WRITE fields listed in `indicesWrite`; it
// is skipped when the reduced line equals the previously written one. The
// terminator byte itself is always copied to the output, including for
// records that were skipped. Data after the last terminator is not emitted.
//
// Returns 0 on success (and when only the usage text is printed), 1 when a
// file cannot be opened, memory cannot be allocated, an I/O error occurs, or
// a single record does not fit in BUFFER_CAPACITY bytes.
int main (int argc, char* argv[])
{
	static const unsigned long indicesWrite[INDICES_WRITE] = {0, 1, 14, 7, 8, 16, 10, 12, 13, 11, 9, 25, 26, 27, 17, 18};

	unsigned char* bufferInput;
	unsigned char* bufferLast;
	unsigned char* bufferPrev;
	unsigned long bufferSize;
	unsigned char* bufferSwap;
	unsigned long indicesArray[INDICES_MAXIMUM + 1];
	unsigned long indicesCount;
	FILE* streamWriter;
	FILE* streamReader;
	unsigned long dupes;
	unsigned long lines;
	unsigned long size;
	unsigned long sizePrev;
	unsigned long i;
	unsigned long j;
	int status = 1;

	if (argc < 3)
	{
		printf ("usage: %s <input> <output>\n", argv[0]);

		return 0;
	}

	streamReader = fopen (argv[1], "rb");

	if (!streamReader)
	{
		fprintf (stderr, "cannot open input file \"%s\" for reading\n", argv[1]);

		return 1;
	}

	streamWriter = fopen (argv[2], "wb");

	if (!streamWriter)
	{
		fprintf (stderr, "cannot open output file \"%s\" for writing\n", argv[2]);
		fclose (streamReader);

		return 1;
	}

	// One allocation split into three equally sized regions
	bufferInput = malloc (sizeof (*bufferInput) * BUFFER_CAPACITY * 3);

	if (!bufferInput)
	{
		fprintf (stderr, "cannot allocate working buffers\n");

		goto cleanup;
	}

	bufferLast = bufferInput + BUFFER_CAPACITY;
	bufferPrev = bufferLast + BUFFER_CAPACITY;
	bufferSize = 0;
	indicesArray[0] = 0;
	indicesCount = 1;
	dupes = 0;
	lines = 0;

	// No line written yet: a length of 0 can never match a real line, so the
	// still-uninitialised bufferPrev is never read
	sizePrev = 0;

	printf ("starting process...\n");

	for (i = 0; i < bufferSize || !feof (streamReader); ++i)
	{
		// End of buffer reached: content must be shifted to the left before
		// buffer is populated with incoming data from input stream
		if (i == bufferSize)
		{
			i = bufferSize - indicesArray[0];

			// The pending record already fills the whole buffer, so nothing
			// more could be read: fail instead of silently truncating
			if (i >= BUFFER_CAPACITY)
			{
				fprintf (stderr, "record exceeds buffer capacity of %lu bytes\n", (unsigned long)BUFFER_CAPACITY);

				goto cleanup;
			}

			memmove (bufferInput, bufferInput + indicesArray[0], i * sizeof (*bufferInput));

			bufferSize = i + fread (bufferInput + i, sizeof (*bufferInput), BUFFER_CAPACITY - i, streamReader);

			// Rebase stored offsets; index 0 is adjusted last because it is
			// the amount being subtracted
			for (j = indicesCount; j--; )
				indicesArray[j] -= indicesArray[0];

			if (ferror (streamReader))
			{
				fprintf (stderr, "error while reading input file \"%s\"\n", argv[1]);

				goto cleanup;
			}

			if (i >= bufferSize)
				break;
		}

		// End of item found: save starting index of the next one
		if (bufferInput[i] == '|')
		{
			if (indicesCount < INDICES_MAXIMUM)
				indicesArray[indicesCount++] = i + 1;
		}

		// End of line found: write required indices and flush list
		else if (bufferInput[i] < ' ')
		{
			if (indicesCount == INDICES_EXPECTED)
			{
				// Sentinel so the last field's length can be computed like
				// the others
				indicesArray[indicesCount] = i + 1;

				size = buildOutputLine (bufferLast, bufferInput, indicesArray, indicesWrite);

				if (size != sizePrev || memcmp (bufferLast, bufferPrev, size * sizeof (*bufferLast)) != 0)
				{
					if ((++lines & PRINT_MASK) == 0)
						printf ("writing line %lu (%lu duplicate(s))...\n", lines, dupes);

					// The trailing 0 byte is not part of the output
					if (fwrite (bufferLast, sizeof (*bufferLast), size - 1, streamWriter) != size - 1)
					{
						fprintf (stderr, "error while writing output file \"%s\"\n", argv[2]);

						goto cleanup;
					}

					bufferSwap = bufferLast;
					bufferLast = bufferPrev;
					bufferPrev = bufferSwap;
					sizePrev = size;
				}
				else
					++dupes;
			}

			// The terminator byte is echoed for every record, kept or not
			if (fwrite (bufferInput + i, sizeof (*bufferInput), 1, streamWriter) != 1)
			{
				fprintf (stderr, "error while writing output file \"%s\"\n", argv[2]);

				goto cleanup;
			}

			indicesArray[0] = i + 1;
			indicesCount = 1;
		}
	}

	if ((lines & PRINT_MASK) != 0)
		printf ("writing line %lu (%lu duplicate(s))...\n", lines, dupes);

	printf ("done.\n");

	status = 0;

cleanup:
	// fclose flushes buffered output, so a failure here means lost data
	if (fclose (streamWriter) != 0 && status == 0)
	{
		fprintf (stderr, "error while closing output file \"%s\"\n", argv[2]);

		status = 1;
	}

	fclose (streamReader);
	free (bufferInput);

	return status;
}