Skip to content

Commit

Permalink
add input file byte range read limit handling
Browse files Browse the repository at this point in the history
  • Loading branch information
Chad Trabant committed Feb 19, 2015
1 parent 47df697 commit 42b6e10
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 14 deletions.
6 changes: 5 additions & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
2015.050: 3.17
- Each input file may be specified with a byte range to read. Useful
to limit dataselect's operation to specific parts of files.

2015.030: 3.16
- Optimize dealing with duplication when pruning traces by changing
- Optimize dealing with duplication when pruning traces by changing
findcoverage() to skip trace entries that are already represented
in the coverage.
- Optimize search for coverage in trimtrace() by avoiding multiple
Expand Down
22 changes: 21 additions & 1 deletion doc/dataselect.1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.TH DATASELECT 1 2014/09/12
.TH DATASELECT 1 2015/02/19
.SH NAME
Mini-SEED data selection, sorting and pruning

Expand Down Expand Up @@ -37,6 +37,11 @@ Files on the command line prefixed with a '@' character are input list
files and are expected to contain a simple list of input files, see
\fBINPUT LIST FILE\fP for more details.

Each input file may be specified with an explicit byte range to read.
The program will begin reading at the specified start offset and stop
reading at the specified end offset. See \fBINPUT FILE RANGE\fP for
more details.

When an input file is full SEED, including both SEED headers and data
records, all of the headers will be skipped and left completely unprocessed.

Expand Down Expand Up @@ -275,6 +280,21 @@ data/day2.mseed
data/day3.mseed
.fi

.SH "INPUT FILE RANGE"
Each input file may be specified with an associated byte range to
read. The program will begin reading at the specified start offset
and finish reading when at or beyond the end offset. The range is
specified by appending an '@' character to the filename with the start
and end offsets separated by a colon:

.nf
filename.mseed@<startoffset>:<endoffset>
.fi

For example: "filename.mseed@4096:8192". Both the start and end
offsets are optional. The colon separator is optional if no end
offset is specified.

.SH "MATCH OR REJECT LIST FILE"
A list file used with either the \fB-M\fP or \fB-R\fP contains a list
of regular expressions (one on each line) that will be combined into a
Expand Down
81 changes: 69 additions & 12 deletions src/dataselect.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
*
* Written by Chad Trabant, IRIS Data Management Center.
*
* modified 2015.130
* modified 2015.050
***************************************************************************/

/***************************************************************************
Expand Down Expand Up @@ -100,6 +100,8 @@
/* _ISOC9X_SOURCE needed to get a declaration for llabs on some archs */
#define _ISOC9X_SOURCE

#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand All @@ -115,21 +117,23 @@

#include "dsarchive.h"

#define VERSION "3.16"
#define VERSION "3.17"
#define PACKAGE "dataselect"

/* Input/output file information containers */
/* Input/output file selection information containers */
typedef struct Filelink_s {
char *infilename; /* Input file name */
FILE *infp; /* Input file descriptor */
char *outfilename; /* Output file name */
FILE *outfp; /* Output file descriptor */
uint64_t startoffset; /* Byte offset to start reading, 0 = unused */
uint64_t endoffset; /* Byte offset to end reading, 0 = unused */
int reordercount; /* Number of records re-ordered */
int recsplitcount; /* Number of records split */
int recrmcount; /* Number of records removed */
int rectrimcount; /* Number of records trimmed */
hptime_t earliest; /* Earliest data time in this file */
hptime_t latest; /* Latest data time in this file */
hptime_t earliest; /* Earliest data time in this file selection */
hptime_t latest; /* Latest data time in this file selection */
int byteswritten; /* Number of bytes written out */
struct Filelink_s *next;
} Filelink;
Expand Down Expand Up @@ -259,7 +263,7 @@ main ( int argc, char **argv )
/* Data stream archiving maximum concurrent open files */
if ( archiveroot )
ds_maxopenfiles = 50;

/* Init written MSTraceList */
if ( writtenfile )
if ( (writtentl = mstl_init (writtentl)) == NULL )
Expand Down Expand Up @@ -386,17 +390,40 @@ readfiles (MSTraceList **ppmstl)

if ( verbose )
{
if ( replaceinput )
ms_log (1, "Reading: %s (was %s)\n", flp->infilename, flp->outfilename);
if ( replaceinput )
{
if ( flp->startoffset || flp->endoffset )
ms_log (1, "Reading: %s (was %s) [range %"PRIu64":%"PRIu64"]\n",
flp->infilename, flp->outfilename, flp->startoffset, flp->endoffset);
else
ms_log (1, "Reading: %s (was %s)\n",
flp->infilename, flp->outfilename);
}
else
ms_log (1, "Reading: %s\n", flp->infilename);
{
if ( flp->startoffset || flp->endoffset )
ms_log (1, "Reading: %s [range %"PRIu64":%"PRIu64"]\n",
flp->infilename, flp->startoffset, flp->endoffset);
else
ms_log (1, "Reading: %s\n", flp->infilename);
}
}

/* Instruct libmseed to start at specified offset by setting a negative file position */
fpos = - flp->startoffset; /* Unset value is a 0, making this a non-operation */

/* Loop over the input file */
while ( (retcode = ms_readmsr_main (&msfp, &msr, flp->infilename, reclen, &fpos, NULL, 1, 0, selections, verbose-2))
== MS_NOERROR )
{
recstarttime = msr->starttime;
/* Break out as EOF if we have read past end offset */
if ( flp->endoffset > 0 && fpos >= flp->endoffset )
{
retcode = MS_ENDOFFILE;
break;
}

recstarttime = msr->starttime;
recendtime = msr_endtime (msr);

/* Generate the srcname with the quality code */
Expand Down Expand Up @@ -728,7 +755,14 @@ readfiles (MSTraceList **ppmstl)

totalrecs++;
totalsamps += msr->samplecnt;
} /* End of looping through records in file */

/* Break out as EOF if record is at or beyond end offset */
if ( flp->endoffset > 0 && (fpos + msr->reclen) >= flp->endoffset )
{
retcode = MS_ENDOFFILE;
break;
}
} /* End of looping through records in file */

/* Critical error if file was not read properly */
if ( retcode != MS_ENDOFFILE )
Expand Down Expand Up @@ -3087,12 +3121,19 @@ setofilelimit (int limit)
*
* Add file to end of the specified file list.
*
* Check for and parse start and end byte offsets (a read range)
* embedded in the file name. The form for specifying a read range is:
* filename@startoffset:endoffset
* where both start and end offsets are optional.
* Returns 0 on success and -1 on error.
***************************************************************************/
static int
addfile (char *filename)
{
Filelink *newlp;
char *at;
char *colon;

if ( ! filename )
{
Expand All @@ -3106,6 +3147,22 @@ addfile (char *filename)
return -1;
}

/* Check for optional read byte range specifiers
* Expected form: "filename@startoffset:endoffset"
* Both start and end offsets are optional */
if ( (at = strrchr (filename, '@')) )
{
*at++ = '\0';

if ( (colon = strrchr (at, ':')) )
{
*colon++ = '\0';
newlp->endoffset = strtoull (colon, NULL, 10);
}

newlp->startoffset = strtoull (at, NULL, 10);
}

if ( ! (newlp->infilename = strdup(filename)) )
{
ms_log (2, "addfile(): Cannot duplicate string\n");
Expand Down Expand Up @@ -3136,7 +3193,7 @@ addfile (char *filename)
* Returns count of files added on success and -1 on error.
***************************************************************************/
static int
addlistfile (char *filename)
addlistfile (char *filename)
{
FILE *fp;
char filelistent[1024];
Expand Down

0 comments on commit 42b6e10

Please sign in to comment.