Statistiques
| Révision :

root / ase / io / pupynere.py

Historique | Voir | Annoter | Télécharger (20,72 ko)

1
"""
2
NetCDF reader/writer module.
3

4
This module implements the Scientific.IO.NetCDF API to read and create
5
NetCDF files. The same API is also used in the PyNIO and pynetcdf
6
modules, allowing these modules to be used interchangebly when working
7
with NetCDF files. The major advantage of ``scipy.io.netcdf`` over other 
8
modules is that it doesn't require the code to be linked to the NetCDF
9
libraries as the other modules do.
10

11
The code is based on the NetCDF file format specification
12
(http://www.unidata.ucar.edu/software/netcdf/guide_15.html). A NetCDF
13
file is a self-describing binary format, with a header followed by
14
data. The header contains metadata describing dimensions, variables
15
and the position of the data in the file, so access can be done in an
16
efficient manner without loading unnecessary data into memory. We use
17
the ``mmap`` module to create Numpy arrays mapped to the data on disk,
18
for the same purpose.
19

20
The structure of a NetCDF file is as follows:
21

22
    C D F <VERSION BYTE> <NUMBER OF RECORDS>
23
    <DIMENSIONS> <GLOBAL ATTRIBUTES> <VARIABLES METADATA>
24
    <NON-RECORD DATA> <RECORD DATA>
25

26
Record data refers to data where the first axis can be expanded at
27
will. All record variables share a same dimension at the first axis,
28
and they are stored at the end of the file per record, ie
29

30
    A[0], B[0], ..., A[1], B[1], ..., etc,
31
    
32
so that new data can be appended to the file without changing its original
33
structure. Non-record data are padded to a 4n bytes boundary. Record data
34
are also padded, unless there is exactly one record variable in the file,
35
in which case the padding is dropped.  All data is stored in big endian
36
byte order.
37

38
The Scientific.IO.NetCDF API allows attributes to be added directly to
39
instances of ``netcdf_file`` and ``netcdf_variable``. To differentiate
40
between user-set attributes and instance attributes, user-set attributes
41
are automatically stored in the ``_attributes`` attribute by overloading
42
``__setattr__``. This is the reason why the code sometimes uses
43
``obj.__dict__['key'] = value``, instead of simply ``obj.key = value``;
44
otherwise the key would be inserted into userspace attributes.
45

46
To create a NetCDF file::
47

48
    >>> import time
49
    >>> f = netcdf_file('simple.nc', 'w')
50
    >>> f.history = 'Created for a test'
51
    >>> f.createDimension('time', 10)
52
    >>> time = f.createVariable('time', 'i', ('time',))
53
    >>> time[:] = range(10)
54
    >>> time.units = 'days since 2008-01-01'
55
    >>> f.close()
56

57
To read the NetCDF file we just created::
58

59
    >>> f = netcdf_file('simple.nc', 'r')
60
    >>> print f.history
61
    Created for a test
62
    >>> time = f.variables['time']
63
    >>> print time.units
64
    days since 2008-01-01
65
    >>> print time.shape
66
    (10,)
67
    >>> print time[-1]
68
    9
69
    >>> f.close()
70

71
TODO: properly implement ``_FillValue``.
72
"""
73

    
74
__all__ = ['netcdf_file', 'netcdf_variable']
75

    
76

    
77
from operator import mul
78
from mmap import mmap, ACCESS_READ
79

    
80
from numpy import fromstring, ndarray, dtype, empty, array, asarray
81
from numpy import little_endian as LITTLE_ENDIAN
82

    
83

    
84
ABSENT       = '\x00\x00\x00\x00\x00\x00\x00\x00' 
85
ZERO         = '\x00\x00\x00\x00'
86
NC_BYTE      = '\x00\x00\x00\x01'
87
NC_CHAR      = '\x00\x00\x00\x02'
88
NC_SHORT     = '\x00\x00\x00\x03'
89
NC_INT       = '\x00\x00\x00\x04'
90
NC_FLOAT     = '\x00\x00\x00\x05'
91
NC_DOUBLE    = '\x00\x00\x00\x06'
92
NC_DIMENSION = '\x00\x00\x00\n'
93
NC_VARIABLE  = '\x00\x00\x00\x0b'
94
NC_ATTRIBUTE = '\x00\x00\x00\x0c'
95

    
96

    
97
TYPEMAP = { NC_BYTE:   ('b', 1),
98
            NC_CHAR:   ('c', 1),
99
            NC_SHORT:  ('h', 2),
100
            NC_INT:    ('i', 4),
101
            NC_FLOAT:  ('f', 4),
102
            NC_DOUBLE: ('d', 8) }
103

    
104
REVERSE = { 'b': NC_BYTE,
105
            'c': NC_CHAR,
106
            'h': NC_SHORT,
107
            'i': NC_INT,
108
            'f': NC_FLOAT,
109
            'd': NC_DOUBLE,
110

    
111
            # these come from asarray(1).dtype.char and asarray('foo').dtype.char,
112
            # used when getting the types from generic attributes.
113
            'l': NC_INT,
114
            'S': NC_CHAR }
115

    
116

    
117
class netcdf_file(object):
118
    """
119
    A ``netcdf_file`` object has two standard attributes: ``dimensions`` and
120
    ``variables``. The values of both are dictionaries, mapping dimension
121
    names to their associated lengths and variable names to variables,
122
    respectively. Application programs should never modify these
123
    dictionaries.
124

125
    All other attributes correspond to global attributes defined in the
126
    NetCDF file. Global file attributes are created by assigning to an
127
    attribute of the ``netcdf_file`` object.
128

129
    """
130
    def __init__(self, filename, mode='r', mmap=True):
131
        if not __debug__:
132
            raise RuntimeError('Current version of pupynere does not ' +
133
                               'work with -O option.  We need to update ' +
134
                               'to version 1.0.7!')
135

    
136
        self.filename = filename
137
        self.use_mmap = mmap
138

    
139
        assert mode in 'rw', "Mode must be either 'r' or 'w'."
140
        self.mode = mode
141

    
142
        self.dimensions = {}
143
        self.variables = {}
144

    
145
        self._dims = []
146
        self._recs = 0
147
        self._recsize = 0
148

    
149
        self.fp = open(self.filename, '%sb' % mode)
150

    
151
        self._attributes = {}
152

    
153
        if mode is 'r':
154
            self._read()
155

    
156
    def __setattr__(self, attr, value):
157
        # Store user defined attributes in a separate dict,
158
        # so we can save them to file later.
159
        try:
160
            self._attributes[attr] = value
161
        except AttributeError:
162
            pass
163
        self.__dict__[attr] = value
164

    
165
    def close(self):
166
        if not self.fp.closed:
167
            try:
168
                self.flush()
169
            finally:
170
                self.fp.close()
171
    __del__ = close
172

    
173
    def createDimension(self, name, length):
174
        self.dimensions[name] = length
175
        self._dims.append(name)
176

    
177
    def createVariable(self, name, type, dimensions):
178
        shape = tuple([self.dimensions[dim] for dim in dimensions]) 
179
        shape_ = tuple([dim or 0 for dim in shape])  # replace None with 0 for numpy
180

    
181
        if isinstance(type, basestring): type = dtype(type)
182
        typecode, size = type.char, type.itemsize
183
        dtype_ = '>%s' % typecode
184
        if size > 1: dtype_ += str(size)
185

    
186
        data = empty(shape_, dtype=dtype_)
187
        self.variables[name] = netcdf_variable(data, typecode, shape, dimensions)
188
        return self.variables[name]
189

    
190
    def flush(self):
191
        if self.mode is 'w':
192
            self._write()
193
    sync = flush
194

    
195
    def _write(self):
196
        self.fp.write('CDF')
197

    
198
        self.__dict__['version_byte'] = 1
199
        self.fp.write(array(1, '>b').tostring())
200

    
201
        # Write headers and data.
202
        self._write_numrecs()
203
        self._write_dim_array()
204
        self._write_gatt_array()
205
        self._write_var_array()
206

    
207
    def _write_numrecs(self):
208
        # Get highest record count from all record variables.
209
        for var in self.variables.values():
210
            if var.isrec and len(var.data) > self._recs:
211
                self.__dict__['_recs'] = len(var.data)
212
        self._pack_int(self._recs)
213

    
214
    def _write_dim_array(self):
215
        if self.dimensions:
216
            self.fp.write(NC_DIMENSION)
217
            self._pack_int(len(self.dimensions))
218
            for name in self._dims:
219
                self._pack_string(name)
220
                length = self.dimensions[name]
221
                self._pack_int(length or 0)  # replace None with 0 for record dimension
222
        else:
223
            self.fp.write(ABSENT)
224

    
225
    def _write_gatt_array(self):
226
        self._write_att_array(self._attributes)
227

    
228
    def _write_att_array(self, attributes):
229
        if attributes:
230
            self.fp.write(NC_ATTRIBUTE)
231
            self._pack_int(len(attributes))
232
            for name, values in attributes.items():
233
                self._pack_string(name)
234
                self._write_values(values)
235
        else:
236
            self.fp.write(ABSENT)
237

    
238
    def _write_var_array(self):
239
        if self.variables:
240
            self.fp.write(NC_VARIABLE)
241
            self._pack_int(len(self.variables))
242

    
243
            # Sort variables non-recs first, then recs.
244
            variables = self.variables.items()
245
            if True: # Backwards compatible with Python versions < 2.4
246
                keys = [(v._shape and not v.isrec, k) for k, v in variables]
247
                keys.sort()
248
                keys.reverse()
249
                variables = [k for isrec, k in keys]
250
            else: # Python version must be >= 2.4
251
                variables.sort(key=lambda (k, v): v._shape and not v.isrec)
252
                variables.reverse()
253
                variables = [k for (k, v) in variables]
254

    
255
            # Set the metadata for all variables.
256
            for name in variables:
257
                self._write_var_metadata(name)
258
            # Now that we have the metadata, we know the vsize of
259
            # each record variable, so we can calculate recsize.
260
            self.__dict__['_recsize'] = sum([
261
                    var._vsize for var in self.variables.values()
262
                    if var.isrec])
263
            # Set the data for all variables.
264
            for name in variables:
265
                self._write_var_data(name)
266
        else:
267
            self.fp.write(ABSENT)
268

    
269
    def _write_var_metadata(self, name):
270
        var = self.variables[name]
271

    
272
        self._pack_string(name)
273
        self._pack_int(len(var.dimensions))
274
        for dimname in var.dimensions:
275
            dimid = self._dims.index(dimname)
276
            self._pack_int(dimid)
277

    
278
        self._write_att_array(var._attributes)
279

    
280
        nc_type = REVERSE[var.typecode()]
281
        self.fp.write(nc_type)
282

    
283
        if not var.isrec:
284
            vsize = var.data.size * var.data.itemsize
285
            vsize += -vsize % 4
286
        else:  # record variable
287
            try:
288
                vsize = var.data[0].size * var.data.itemsize
289
            except IndexError:
290
                vsize = 0
291
            rec_vars = len([var for var in self.variables.values()
292
                    if var.isrec])
293
            if rec_vars > 1:
294
                vsize += -vsize % 4
295
        self.variables[name].__dict__['_vsize'] = vsize
296
        self._pack_int(vsize)
297

    
298
        # Pack a bogus begin, and set the real value later.
299
        self.variables[name].__dict__['_begin'] = self.fp.tell()
300
        self._pack_begin(0)
301

    
302
    def _write_var_data(self, name):
303
        var = self.variables[name]
304
        
305
        # Set begin in file header.
306
        the_beguine = self.fp.tell()
307
        self.fp.seek(var._begin)
308
        self._pack_begin(the_beguine)
309
        self.fp.seek(the_beguine)
310

    
311
        # Write data.
312
        if not var.isrec:
313
            self.fp.write(var.data.tostring())    
314
            count = var.data.size * var.data.itemsize
315
            self.fp.write('0' * (var._vsize - count))
316
        else:  # record variable
317
            # Handle rec vars with shape[0] < nrecs.
318
            if self._recs > len(var.data):
319
                shape = (self._recs,) + var.data.shape[1:]
320
                var.data.resize(shape)
321

    
322
            pos0 = pos = self.fp.tell()
323
            for rec in var.data:
324
                # Apparently scalars cannot be converted to big endian. If we
325
                # try to convert a ``=i4`` scalar to, say, '>i4' the dtype
326
                # will remain as ``=i4``.
327
                if not rec.shape and (rec.dtype.byteorder == '<' or
328
                        (rec.dtype.byteorder == '=' and LITTLE_ENDIAN)):
329
                    rec = rec.byteswap()
330
                self.fp.write(rec.tostring())
331
                # Padding
332
                count = rec.size * rec.itemsize
333
                self.fp.write('0' * (var._vsize - count))
334
                pos += self._recsize
335
                self.fp.seek(pos)
336
            self.fp.seek(pos0 + var._vsize)
337

    
338
    def _write_values(self, values):
339
        values = asarray(values) 
340
        values = values.astype(values.dtype.newbyteorder('>'))
341

    
342
        nc_type = REVERSE[values.dtype.char]
343
        self.fp.write(nc_type)
344

    
345
        if values.dtype.char == 'S':
346
            nelems = values.itemsize
347
        else:
348
            nelems = values.size
349
        self._pack_int(nelems)
350

    
351
        if not values.shape and (values.dtype.byteorder == '<' or
352
                (values.dtype.byteorder == '=' and LITTLE_ENDIAN)):
353
            values = values.byteswap()
354
        self.fp.write(values.tostring())
355
        count = values.size * values.itemsize
356
        self.fp.write('0' * (-count % 4))  # pad
357

    
358
    def _read(self):
359
        # Check magic bytes and version
360
        assert self.fp.read(3) == 'CDF', "Error: %s is not a valid NetCDF 3 file" % self.filename
361
        self.__dict__['version_byte'] = fromstring(self.fp.read(1), '>b')[0]
362

    
363
        # Read file headers and set data.
364
        self._read_numrecs()
365
        self._read_dim_array()
366
        self._read_gatt_array()
367
        self._read_var_array()
368

    
369
    def _read_numrecs(self):
370
        self.__dict__['_recs'] = self._unpack_int()
371

    
372
    def _read_dim_array(self):
373
        assert self.fp.read(4) in [ZERO, NC_DIMENSION]
374
        count = self._unpack_int()
375

    
376
        for dim in range(count):
377
            name = self._unpack_string()
378
            length = self._unpack_int() or None  # None for record dimension
379
            self.dimensions[name] = length
380
            self._dims.append(name)  # preserve order
381

    
382
    def _read_gatt_array(self):
383
        for k, v in self._read_att_array().items():
384
            self.__setattr__(k, v)
385

    
386
    def _read_att_array(self):
387
        assert self.fp.read(4) in [ZERO, NC_ATTRIBUTE]
388
        count = self._unpack_int()
389

    
390
        attributes = {}
391
        for attr in range(count):
392
            name = self._unpack_string()
393
            attributes[name] = self._read_values()
394
        return attributes
395

    
396
    def _read_var_array(self):
397
        assert self.fp.read(4) in [ZERO, NC_VARIABLE]
398

    
399
        begin = 0
400
        dtypes = {'names': [], 'formats': []}
401
        rec_vars = []
402
        count = self._unpack_int()
403
        for var in range(count):
404
            name, dimensions, shape, attributes, typecode, size, dtype_, begin_, vsize = self._read_var()
405
            if shape and shape[0] is None:
406
                rec_vars.append(name)
407
                self.__dict__['_recsize'] += vsize
408
                if begin == 0: begin = begin_
409
                dtypes['names'].append(name)
410
                dtypes['formats'].append(str(shape[1:]) + dtype_)
411

    
412
                # Handle padding with a virtual variable.
413
                if typecode in 'bch':
414
                    actual_size = reduce(mul, (1,) + shape[1:]) * size
415
                    padding = -actual_size % 4
416
                    if padding:
417
                        dtypes['names'].append('_padding_%d' % var)
418
                        dtypes['formats'].append('(%d,)>b' % padding)
419

    
420
                # Data will be set later.
421
                data = None
422
            else:
423
                if self.use_mmap:
424
                    mm = mmap(self.fp.fileno(), begin_+vsize, access=ACCESS_READ)
425
                    data = ndarray.__new__(ndarray, shape, dtype=dtype_,
426
                            buffer=mm, offset=begin_, order=0)
427
                else:
428
                    pos = self.fp.tell()
429
                    self.fp.seek(begin_)
430
                    data = fromstring(self.fp.read(vsize), dtype=dtype_)
431
                    data.shape = shape
432
                    self.fp.seek(pos)
433

    
434
            # Add variable.
435
            self.variables[name] = netcdf_variable(
436
                    data, typecode, shape, dimensions, attributes)
437

    
438
        if rec_vars:
439
            # Remove padding when only one record variable.
440
            if len(rec_vars) == 1:
441
                dtypes['names'] = dtypes['names'][:1]
442
                dtypes['formats'] = dtypes['formats'][:1]
443

    
444
            # Build rec array.
445
            if self.use_mmap:
446
                mm = mmap(self.fp.fileno(), begin+self._recs*self._recsize, access=ACCESS_READ)
447
                rec_array = ndarray.__new__(ndarray, (self._recs,), dtype=dtypes,
448
                        buffer=mm, offset=begin, order=0)
449
            else:
450
                pos = self.fp.tell()
451
                self.fp.seek(begin)
452
                rec_array = fromstring(self.fp.read(self._recs*self._recsize), dtype=dtypes)
453
                rec_array.shape = (self._recs,)
454
                self.fp.seek(pos)
455

    
456
            for var in rec_vars:
457
                self.variables[var].__dict__['data'] = rec_array[var]
458

    
459
    def _read_var(self):
460
        name = self._unpack_string()
461
        dimensions = []
462
        shape = []
463
        dims = self._unpack_int()
464
        
465
        for i in range(dims):
466
            dimid = self._unpack_int()
467
            dimname = self._dims[dimid]
468
            dimensions.append(dimname)
469
            dim = self.dimensions[dimname]
470
            shape.append(dim)
471
        dimensions = tuple(dimensions)
472
        shape = tuple(shape)
473

    
474
        attributes = self._read_att_array()
475
        nc_type = self.fp.read(4)
476
        vsize = self._unpack_int()
477
        begin = [self._unpack_int, self._unpack_int64][self.version_byte-1]()
478

    
479
        typecode, size = TYPEMAP[nc_type]
480
        if typecode is 'c':
481
            dtype_ = '>c'
482
        else:
483
            dtype_ = '>%s' % typecode
484
            if size > 1: dtype_ += str(size)
485

    
486
        return name, dimensions, shape, attributes, typecode, size, dtype_, begin, vsize
487

    
488
    def _read_values(self):
489
        nc_type = self.fp.read(4)
490
        n = self._unpack_int()
491

    
492
        typecode, size = TYPEMAP[nc_type]
493

    
494
        count = n*size
495
        values = self.fp.read(count)
496
        self.fp.read(-count % 4)  # read padding
497

    
498
        if typecode is not 'c':
499
            values = fromstring(values, dtype='>%s%d' % (typecode, size))
500
            if values.shape == (1,): values = values[0]
501
        else:
502
            values = values.rstrip('\x00') 
503
        return values
504

    
505
    def _pack_begin(self, begin):
506
        if self.version_byte == 1:
507
            self._pack_int(begin)
508
        elif self.version_byte == 2:
509
            self._pack_int64(begin)
510

    
511
    def _pack_int(self, value):
512
        self.fp.write(array(value, '>i').tostring())
513
    _pack_int32 = _pack_int
514

    
515
    def _unpack_int(self):
516
        return int(fromstring(self.fp.read(4), '>i')[0])
517
    _unpack_int32 = _unpack_int
518

    
519
    def _pack_int64(self, value):
520
        self.fp.write(array(value, '>q').tostring())
521

    
522
    def _unpack_int64(self):
523
        return int(fromstring(self.fp.read(8), '>q')[0])
524

    
525
    def _pack_string(self, s):
526
        count = len(s)
527
        self._pack_int(count)
528
        self.fp.write(s)
529
        self.fp.write('0' * (-count % 4))  # pad
530

    
531
    def _unpack_string(self):
532
        count = self._unpack_int()
533
        s = self.fp.read(count).rstrip('\x00')
534
        self.fp.read(-count % 4)  # read padding
535
        return s
536

    
537

    
538
class netcdf_variable(object):
539
    """
540
    ``netcdf_variable`` objects are constructed by calling the method
541
    ``createVariable`` on the netcdf_file object.
542

543
    ``netcdf_variable`` objects behave much like array objects defined in
544
    Numpy, except that their data resides in a file. Data is read by
545
    indexing and written by assigning to an indexed subset; the entire
546
    array can be accessed by the index ``[:]`` or using the methods
547
    ``getValue`` and ``assignValue``. ``netcdf_variable`` objects also
548
    have attribute ``shape`` with the same meaning as for arrays, but
549
    the shape cannot be modified. There is another read-only attribute
550
    ``dimensions``, whose value is the tuple of dimension names.
551

552
    All other attributes correspond to variable attributes defined in
553
    the NetCDF file. Variable attributes are created by assigning to an
554
    attribute of the ``netcdf_variable`` object.
555

556
    """
557
    def __init__(self, data, typecode, shape, dimensions, attributes=None):
558
        self.data = data
559
        self._typecode = typecode
560
        self._shape = shape
561
        self.dimensions = dimensions
562

    
563
        self._attributes = attributes or {}
564
        for k, v in self._attributes.items():
565
            self.__dict__[k] = v
566

    
567
    def __setattr__(self, attr, value):
568
        # Store user defined attributes in a separate dict,
569
        # so we can save them to file later.
570
        try:
571
            self._attributes[attr] = value
572
        except AttributeError:
573
            pass
574
        self.__dict__[attr] = value
575

    
576
    def isrec(self):
577
        return self.data.shape and not self._shape[0]
578
    isrec = property(isrec)
579

    
580
    def shape(self):
581
        return self.data.shape
582
    shape = property(shape)
583
    
584
    def getValue(self):
585
        return self.data.item()
586

    
587
    def assignValue(self, value):
588
        self.data.itemset(value)
589

    
590
    def typecode(self):
591
        return self._typecode
592

    
593
    def __getitem__(self, index):
594
        return self.data[index]
595

    
596
    def __setitem__(self, index, data):
597
        # Expand data for record vars?
598
        if self.isrec:
599
            if isinstance(index, tuple):
600
                rec_index = index[0]
601
            else:
602
                rec_index = index
603
            if isinstance(rec_index, slice):
604
                recs = (rec_index.start or 0) + len(data)
605
            else:
606
                recs = rec_index + 1
607
            if recs > len(self.data):
608
                shape = (recs,) + self._shape[1:]
609
                self.data.resize(shape)
610
        self.data[index] = data
611

    
612

    
613
NetCDFFile = netcdf_file
614
NetCDFVariable = netcdf_variable