""" 

Binary serialization 

 

NPY format 

========== 

 

A simple format for saving numpy arrays to disk with the full 

information about them. 

 

The ``.npy`` format is the standard binary file format in NumPy for 

persisting a *single* arbitrary NumPy array on disk. The format stores all 

of the shape and dtype information necessary to reconstruct the array 

correctly even on another machine with a different architecture. 

The format is designed to be as simple as possible while achieving 

its limited goals. 

 

The ``.npz`` format is the standard format for persisting *multiple* NumPy 

arrays on disk. A ``.npz`` file is a zip file containing multiple ``.npy`` 

files, one for each array. 

 

Capabilities 

------------ 

 

- Can represent all NumPy arrays including nested record arrays and 

object arrays. 

 

- Represents the data in its native binary form. 

 

- Supports Fortran-contiguous arrays directly. 

 

- Stores all of the necessary information to reconstruct the array 

including shape and dtype on a machine of a different 

architecture. Both little-endian and big-endian arrays are 

supported, and a file with little-endian numbers will yield 

a little-endian array on any machine reading the file. The 

types are described in terms of their actual sizes. For example, 

if a machine with a 64-bit C "long int" writes out an array with 

"long ints", a reading machine with 32-bit C "long ints" will yield 

an array with 64-bit integers. 

 

- Is straightforward to reverse engineer. Datasets often live longer than 

the programs that created them. A competent developer should be 

able to create a solution in their preferred programming language to 

  read most ``.npy`` files that they have been given without much

documentation. 

 

- Allows memory-mapping of the data. See `open_memmap`.

 

- Can be read from a filelike stream object instead of an actual file. 

 

- Stores object arrays, i.e. arrays containing elements that are arbitrary 

  Python objects. Files with object arrays cannot be memory-mapped, but

can be read and written to disk. 

 

Limitations 

----------- 

 

- Arbitrary subclasses of numpy.ndarray are not completely preserved. 

Subclasses will be accepted for writing, but only the array data will 

be written out. A regular numpy.ndarray object will be created 

upon reading the file. 

 

.. warning:: 

 

Due to limitations in the interpretation of structured dtypes, dtypes 

with fields with empty names will have the names replaced by 'f0', 'f1', 

etc. Such arrays will not round-trip through the format entirely 

accurately. The data is intact; only the field names will differ. We are 

working on a fix for this. This fix will not require a change in the 

file format. The arrays with such structures can still be saved and 

restored, and the correct dtype may be restored by using the 

``loadedarray.view(correct_dtype)`` method. 

 

File extensions 

--------------- 

 

We recommend using the ``.npy`` and ``.npz`` extensions for files saved 

in this format. This is by no means a requirement; applications may wish 

to use these file formats but use an extension specific to the 

application. In the absence of an obvious alternative, however, 

we suggest using ``.npy`` and ``.npz``. 

 

Version numbering 

----------------- 

 

The version numbering of these formats is independent of NumPy version 

numbering. If the format is upgraded, the code in `numpy.io` will still 

be able to read and write Version 1.0 files. 

 

Format Version 1.0 

------------------ 

 

The first 6 bytes are a magic string: exactly ``\\x93NUMPY``. 

 

The next 1 byte is an unsigned byte: the major version number of the file 

format, e.g. ``\\x01``. 

 

The next 1 byte is an unsigned byte: the minor version number of the file 

format, e.g. ``\\x00``. Note: the version of the file format is not tied 

to the version of the numpy package. 

 

The next 2 bytes form a little-endian unsigned short int: the length of 

the header data HEADER_LEN. 

 

The next HEADER_LEN bytes form the header data describing the array's 

format. It is an ASCII string which contains a Python literal expression 

of a dictionary. It is terminated by a newline (``\\n``) and padded with 

spaces (``\\x20``) to make the total of 

``len(magic string) + 2 + len(length) + HEADER_LEN`` be evenly divisible 

by 64 for alignment purposes. 

 

The dictionary contains three keys: 

 

"descr" : dtype.descr 

An object that can be passed as an argument to the `numpy.dtype` 

constructor to create the array's dtype. 

"fortran_order" : bool 

Whether the array data is Fortran-contiguous or not. Since 

Fortran-contiguous arrays are a common form of non-C-contiguity, 

we allow them to be written directly to disk for efficiency. 

"shape" : tuple of int 

The shape of the array. 

 

For repeatability and readability, the dictionary keys are sorted in 

alphabetic order. This is for convenience only. A writer SHOULD implement 

this if possible. A reader MUST NOT depend on this. 

 

Following the header comes the array data. If the dtype contains Python 

objects (i.e. ``dtype.hasobject is True``), then the data is a Python 

pickle of the array. Otherwise the data is the contiguous (either C- 

or Fortran-, depending on ``fortran_order``) bytes of the array. 

Consumers can figure out the number of bytes by multiplying the number 

of elements given by the shape (noting that ``shape=()`` means there is 

1 element) by ``dtype.itemsize``. 
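
As an illustration, a version 1.0 file can be parsed with nothing but the
standard library. The following is a minimal sketch without error handling;
the filename is hypothetical::

    import ast
    import struct

    with open('example.npy', 'rb') as f:
        assert f.read(6) == b'\\x93NUMPY'    # magic string
        major, minor = f.read(2)             # version bytes (ints on Python 3)
        header_len, = struct.unpack('<H', f.read(2))  # '<I' for version 2.0
        header = ast.literal_eval(f.read(header_len).decode('latin1'))
        data = f.read()                      # raw array bytes, or a pickle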

 

Format Version 2.0 

------------------ 

 

The version 1.0 format only allowed the array header to have a total size of 

65535 bytes. This can be exceeded by structured arrays with a large number of 

columns. The version 2.0 format extends the header size to 4 GiB. 

`numpy.save` will automatically save in 2.0 format if the data requires it, 

else it will always use the more compatible 1.0 format. 

 

The description of the fourth element of the header therefore has become: 

"The next 4 bytes form a little-endian unsigned int: the length of the header 

data HEADER_LEN." 

 

Notes 

----- 

The ``.npy`` format, including motivation for creating it and a comparison of 

alternatives, is described in the `"npy-format" NEP  

<https://www.numpy.org/neps/nep-0001-npy-format.html>`_; however, details have
evolved over time and this document is more current.

 

""" 

from __future__ import division, absolute_import, print_function 

 

import numpy 

import sys 

import io 

import warnings 

from numpy.lib.utils import safe_eval 

from numpy.compat import ( 

asbytes, asstr, isfileobj, long, os_fspath 

) 

from numpy.core.numeric import pickle 

 

 

MAGIC_PREFIX = b'\x93NUMPY' 

MAGIC_LEN = len(MAGIC_PREFIX) + 2 

ARRAY_ALIGN = 64 # plausible values are powers of 2 between 16 and 4096 

BUFFER_SIZE = 2**18 # size of buffer for reading npz files in bytes 

 

# difference between version 1.0 and 2.0 is a 4 byte (I) header length 

# instead of 2 bytes (H) allowing storage of large structured arrays 

 

def _check_version(version): 

if version not in [(1, 0), (2, 0), None]: 

msg = "we only support format version (1,0) and (2, 0), not %s" 

raise ValueError(msg % (version,)) 

 

def magic(major, minor): 

""" Return the magic string for the given file format version. 

 

Parameters 

---------- 

major : int in [0, 255] 

minor : int in [0, 255] 

 

Returns 

------- 

magic : str 

 

Raises 

------ 

    ValueError
        If the version cannot be formatted.
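
    Examples
    --------
    A quick check of the eight-byte prefix (Python 3 repr shown;
    illustrative only):

    >>> magic(1, 0)
    b'\\x93NUMPY\\x01\\x00'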

""" 

if major < 0 or major > 255: 

raise ValueError("major version must be 0 <= major < 256") 

if minor < 0 or minor > 255: 

raise ValueError("minor version must be 0 <= minor < 256") 

if sys.version_info[0] < 3: 

return MAGIC_PREFIX + chr(major) + chr(minor) 

else: 

return MAGIC_PREFIX + bytes([major, minor]) 

 

def read_magic(fp): 

""" Read the magic string to get the version of the file format. 

 

Parameters 

---------- 

fp : filelike object 

 

Returns 

------- 

major : int 

minor : int 
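
    Examples
    --------
    A minimal sketch using an in-memory stream (assumes a valid
    version 1.0 prefix; illustrative only):

    >>> import io
    >>> read_magic(io.BytesIO(magic(1, 0)))
    (1, 0)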

""" 

magic_str = _read_bytes(fp, MAGIC_LEN, "magic string") 

if magic_str[:-2] != MAGIC_PREFIX: 

msg = "the magic string is not correct; expected %r, got %r" 

raise ValueError(msg % (MAGIC_PREFIX, magic_str[:-2])) 

if sys.version_info[0] < 3: 

major, minor = map(ord, magic_str[-2:]) 

else: 

major, minor = magic_str[-2:] 

return major, minor 

 

def dtype_to_descr(dtype): 

""" 

Get a serializable descriptor from the dtype. 

 

The .descr attribute of a dtype object cannot be round-tripped through 

the dtype() constructor. Simple types, like dtype('float32'), have 

a descr which looks like a record array with one field with '' as 

a name. The dtype() constructor interprets this as a request to give 

    a default name. Instead, we construct a descriptor that can be passed to

dtype(). 

 

Parameters 

---------- 

dtype : dtype 

The dtype of the array that will be written to disk. 

 

Returns 

------- 

descr : object 

An object that can be passed to `numpy.dtype()` in order to 

replicate the input dtype. 
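
    Examples
    --------
    Simple dtypes map to their array-protocol string, while structured
    dtypes keep their full ``.descr`` list (a little-endian machine is
    assumed for the scalar example):

    >>> dtype_to_descr(numpy.dtype(numpy.float32))
    '<f4'
    >>> dtype_to_descr(numpy.dtype([('x', '<i4'), ('y', '<f8')]))
    [('x', '<i4'), ('y', '<f8')]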

 

""" 

if dtype.names is not None: 

# This is a record array. The .descr is fine. XXX: parts of the 

# record array with an empty name, like padding bytes, still get 

# fiddled with. This needs to be fixed in the C implementation of 

# dtype(). 

return dtype.descr 

else: 

return dtype.str 

 

def descr_to_dtype(descr): 

''' 

descr may be stored as dtype.descr, which is a list of 

(name, format, [shape]) tuples. Offsets are not explicitly saved, rather 

empty fields with name,format == '', '|Vn' are added as padding. 

 

This function reverses the process, eliminating the empty padding fields. 
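
    As a sketch, a ``'|V4'`` padding field is folded back into an offset
    (values illustrative):

    >>> dt = descr_to_dtype([('x', '<i4'), ('', '|V4'), ('y', '<f8')])
    >>> dt.names, dt.itemsize
    (('x', 'y'), 16)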

''' 

if isinstance(descr, (str, dict)): 

# No padding removal needed 

return numpy.dtype(descr) 

 

fields = [] 

offset = 0 

for field in descr: 

if len(field) == 2: 

name, descr_str = field 

dt = descr_to_dtype(descr_str) 

else: 

name, descr_str, shape = field 

dt = numpy.dtype((descr_to_dtype(descr_str), shape)) 

 

# Ignore padding bytes, which will be void bytes with '' as name 

        # Once support for blank names is removed, only "if name == ''" is needed

is_pad = (name == '' and dt.type is numpy.void and dt.names is None) 

if not is_pad: 

fields.append((name, dt, offset)) 

 

offset += dt.itemsize 

 

names, formats, offsets = zip(*fields) 

# names may be (title, names) tuples 

nametups = (n if isinstance(n, tuple) else (None, n) for n in names) 

titles, names = zip(*nametups) 

return numpy.dtype({'names': names, 'formats': formats, 'titles': titles, 

'offsets': offsets, 'itemsize': offset}) 

 

def header_data_from_array_1_0(array): 

""" Get the dictionary of header metadata from a numpy.ndarray. 

 

Parameters 

---------- 

array : numpy.ndarray 

 

Returns 

------- 

d : dict 

This has the appropriate entries for writing its string representation 

to the header of the file. 
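
    Examples
    --------
    For a small C-contiguous array (little-endian byte order spelled out
    explicitly, so the result is platform-independent):

    >>> d = header_data_from_array_1_0(numpy.arange(6, dtype='<i4').reshape(2, 3))
    >>> d['shape'], d['fortran_order'], d['descr']
    ((2, 3), False, '<i4')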

""" 

d = {'shape': array.shape} 

if array.flags.c_contiguous: 

d['fortran_order'] = False 

elif array.flags.f_contiguous: 

d['fortran_order'] = True 

else: 

# Totally non-contiguous data. We will have to make it C-contiguous 

# before writing. Note that we need to test for C_CONTIGUOUS first 

# because a 1-D array is both C_CONTIGUOUS and F_CONTIGUOUS. 

d['fortran_order'] = False 

 

d['descr'] = dtype_to_descr(array.dtype) 

return d 

 

def _write_array_header(fp, d, version=None): 

""" Write the header for an array and returns the version used 

 

Parameters 

---------- 

fp : filelike object 

d : dict 

This has the appropriate entries for writing its string representation 

to the header of the file. 

    version : tuple or None
        None means use the oldest version that works. Passing an explicit
        version will raise a ValueError if the format does not allow
        saving this data. Default: None

Returns 

------- 

version : tuple of int 

the file version which needs to be used to store the data 

""" 

import struct 

header = ["{"] 

for key, value in sorted(d.items()): 

# Need to use repr here, since we eval these when reading 

header.append("'%s': %s, " % (key, repr(value))) 

header.append("}") 

header = "".join(header) 

header = asbytes(_filter_header(header)) 

 

hlen = len(header) + 1 # 1 for newline 

padlen_v1 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<H') + hlen) % ARRAY_ALIGN) 

padlen_v2 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<I') + hlen) % ARRAY_ALIGN) 

 

# Which version(s) we write depends on the total header size; v1 has a max of 65535 

if hlen + padlen_v1 < 2**16 and version in (None, (1, 0)): 

version = (1, 0) 

header_prefix = magic(1, 0) + struct.pack('<H', hlen + padlen_v1) 

topad = padlen_v1 

elif hlen + padlen_v2 < 2**32 and version in (None, (2, 0)): 

version = (2, 0) 

header_prefix = magic(2, 0) + struct.pack('<I', hlen + padlen_v2) 

topad = padlen_v2 

else: 

msg = "Header length %s too big for version=%s" 

msg %= (hlen, version) 

raise ValueError(msg) 

 

# Pad the header with spaces and a final newline such that the magic 

# string, the header-length short and the header are aligned on a 

# ARRAY_ALIGN byte boundary. This supports memory mapping of dtypes 

# aligned up to ARRAY_ALIGN on systems like Linux where mmap() 

# offset must be page-aligned (i.e. the beginning of the file). 

header = header + b' '*topad + b'\n' 

 

fp.write(header_prefix) 

fp.write(header) 

return version 

 

def write_array_header_1_0(fp, d): 

""" Write the header for an array using the 1.0 format. 

 

Parameters 

---------- 

fp : filelike object 

d : dict 

This has the appropriate entries for writing its string 

representation to the header of the file. 

""" 

_write_array_header(fp, d, (1, 0)) 

 

 

def write_array_header_2_0(fp, d): 

""" Write the header for an array using the 2.0 format. 

The 2.0 format allows storing very large structured arrays. 

 

.. versionadded:: 1.9.0 

 

Parameters 

---------- 

fp : filelike object 

d : dict 

This has the appropriate entries for writing its string 

representation to the header of the file. 

""" 

_write_array_header(fp, d, (2, 0)) 

 

def read_array_header_1_0(fp): 

""" 

Read an array header from a filelike object using the 1.0 file format 

version. 

 

This will leave the file object located just after the header. 

 

Parameters 

---------- 

fp : filelike object 

A file object or something with a `.read()` method like a file. 

 

Returns 

------- 

shape : tuple of int 

The shape of the array. 

fortran_order : bool 

The array data will be written out directly if it is either 

C-contiguous or Fortran-contiguous. Otherwise, it will be made 

contiguous before writing it out. 

dtype : dtype 

The dtype of the file's data. 

 

Raises 

------ 

ValueError 

If the data is invalid. 

 

""" 

return _read_array_header(fp, version=(1, 0)) 

 

def read_array_header_2_0(fp): 

""" 

Read an array header from a filelike object using the 2.0 file format 

version. 

 

This will leave the file object located just after the header. 

 

.. versionadded:: 1.9.0 

 

Parameters 

---------- 

fp : filelike object 

A file object or something with a `.read()` method like a file. 

 

Returns 

------- 

shape : tuple of int 

The shape of the array. 

fortran_order : bool 

The array data will be written out directly if it is either 

C-contiguous or Fortran-contiguous. Otherwise, it will be made 

contiguous before writing it out. 

dtype : dtype 

The dtype of the file's data. 

 

Raises 

------ 

ValueError 

If the data is invalid. 

 

""" 

return _read_array_header(fp, version=(2, 0)) 

 

 

def _filter_header(s): 

"""Clean up 'L' in npz header ints. 

 

    Cleans up the 'L' in strings representing integers. Needed to allow npy
    headers produced in Python 2 to be read in Python 3.

 

Parameters 

---------- 

s : byte string 

Npy file header. 

 

Returns 

------- 

header : str 

Cleaned up header. 

 

""" 

import tokenize 

if sys.version_info[0] >= 3: 

from io import StringIO 

else: 

from StringIO import StringIO 

 

tokens = [] 

last_token_was_number = False 

# adding newline as python 2.7.5 workaround 

string = asstr(s) + "\n" 

for token in tokenize.generate_tokens(StringIO(string).readline): 

token_type = token[0] 

token_string = token[1] 

if (last_token_was_number and 

token_type == tokenize.NAME and 

token_string == "L"): 

continue 

else: 

tokens.append(token) 

last_token_was_number = (token_type == tokenize.NUMBER) 

# removing newline (see above) as python 2.7.5 workaround 

return tokenize.untokenize(tokens)[:-1] 

 

 

def _read_array_header(fp, version): 

""" 

see read_array_header_1_0 

""" 

    # Read the length of the header: an unsigned, little-endian short int
    # for version (1, 0), or an unsigned, little-endian int for (2, 0).

import struct 

if version == (1, 0): 

hlength_type = '<H' 

elif version == (2, 0): 

hlength_type = '<I' 

else: 

raise ValueError("Invalid version %r" % version) 

 

hlength_str = _read_bytes(fp, struct.calcsize(hlength_type), "array header length") 

header_length = struct.unpack(hlength_type, hlength_str)[0] 

header = _read_bytes(fp, header_length, "array header") 

 

# The header is a pretty-printed string representation of a literal 

    # Python dictionary with trailing newlines padded to an ARRAY_ALIGN byte

# boundary. The keys are strings. 

# "shape" : tuple of int 

# "fortran_order" : bool 

# "descr" : dtype.descr 

header = _filter_header(header) 

try: 

d = safe_eval(header) 

except SyntaxError as e: 

msg = "Cannot parse header: %r\nException: %r" 

raise ValueError(msg % (header, e)) 

if not isinstance(d, dict): 

msg = "Header is not a dictionary: %r" 

raise ValueError(msg % d) 

keys = sorted(d.keys()) 

if keys != ['descr', 'fortran_order', 'shape']: 

msg = "Header does not contain the correct keys: %r" 

raise ValueError(msg % (keys,)) 

 

# Sanity-check the values. 

if (not isinstance(d['shape'], tuple) or 

not numpy.all([isinstance(x, (int, long)) for x in d['shape']])): 

msg = "shape is not valid: %r" 

raise ValueError(msg % (d['shape'],)) 

if not isinstance(d['fortran_order'], bool): 

msg = "fortran_order is not a valid bool: %r" 

raise ValueError(msg % (d['fortran_order'],)) 

try: 

dtype = descr_to_dtype(d['descr']) 

    except TypeError:

msg = "descr is not a valid dtype descriptor: %r" 

raise ValueError(msg % (d['descr'],)) 

 

return d['shape'], d['fortran_order'], dtype 

 

def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None): 

""" 

Write an array to an NPY file, including a header. 

 

If the array is neither C-contiguous nor Fortran-contiguous AND the 

file_like object is not a real file object, this function will have to 

copy data in memory. 

 

Parameters 

---------- 

fp : file_like object 

An open, writable file object, or similar object with a 

``.write()`` method. 

array : ndarray 

The array to write to disk. 

version : (int, int) or None, optional 

The version number of the format. None means use the oldest 

supported version that is able to store the data. Default: None 

allow_pickle : bool, optional 

Whether to allow writing pickled data. Default: True 

pickle_kwargs : dict, optional 

Additional keyword arguments to pass to pickle.dump, excluding 

'protocol'. These are only useful when pickling objects in object 

arrays on Python 3 to Python 2 compatible format. 

 

Raises 

------ 

ValueError 

If the array cannot be persisted. This includes the case of 

allow_pickle=False and array being an object array. 

Various other errors 

If the array contains Python objects as part of its dtype, the 

process of pickling them may raise various errors if the objects 

are not picklable. 
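
    Examples
    --------
    A round trip through an in-memory buffer, read back with
    `read_array` (a minimal sketch):

    >>> import io
    >>> buf = io.BytesIO()
    >>> write_array(buf, numpy.array([1, 2, 3]))
    >>> _ = buf.seek(0)
    >>> read_array(buf)
    array([1, 2, 3])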

 

""" 

_check_version(version) 

used_ver = _write_array_header(fp, header_data_from_array_1_0(array), 

version) 

# this warning can be removed when 1.9 has aged enough 

if version != (2, 0) and used_ver == (2, 0): 

warnings.warn("Stored array in format 2.0. It can only be" 

"read by NumPy >= 1.9", UserWarning, stacklevel=2) 

 

if array.itemsize == 0: 

buffersize = 0 

else: 

# Set buffer size to 16 MiB to hide the Python loop overhead. 

buffersize = max(16 * 1024 ** 2 // array.itemsize, 1) 

 

if array.dtype.hasobject: 

# We contain Python objects so we cannot write out the data 

# directly. Instead, we will pickle it out with version 2 of the 

# pickle protocol. 

if not allow_pickle: 

raise ValueError("Object arrays cannot be saved when " 

"allow_pickle=False") 

if pickle_kwargs is None: 

pickle_kwargs = {} 

pickle.dump(array, fp, protocol=2, **pickle_kwargs) 

elif array.flags.f_contiguous and not array.flags.c_contiguous: 

if isfileobj(fp): 

array.T.tofile(fp) 

else: 

for chunk in numpy.nditer( 

array, flags=['external_loop', 'buffered', 'zerosize_ok'], 

buffersize=buffersize, order='F'): 

fp.write(chunk.tobytes('C')) 

else: 

if isfileobj(fp): 

array.tofile(fp) 

else: 

for chunk in numpy.nditer( 

array, flags=['external_loop', 'buffered', 'zerosize_ok'], 

buffersize=buffersize, order='C'): 

fp.write(chunk.tobytes('C')) 

 

 

def read_array(fp, allow_pickle=True, pickle_kwargs=None): 

""" 

Read an array from an NPY file. 

 

Parameters 

---------- 

fp : file_like object 

If this is not a real file object, then this may take extra memory 

and time. 

allow_pickle : bool, optional 

Whether to allow reading pickled data. Default: True 

    pickle_kwargs : dict, optional

Additional keyword arguments to pass to pickle.load. These are only 

useful when loading object arrays saved on Python 2 when using 

Python 3. 

 

Returns 

------- 

array : ndarray 

The array from the data on disk. 

 

Raises 

------ 

ValueError 

If the data is invalid, or allow_pickle=False and the file contains 

an object array. 

 

""" 

version = read_magic(fp) 

_check_version(version) 

shape, fortran_order, dtype = _read_array_header(fp, version) 

if len(shape) == 0: 

count = 1 

else: 

count = numpy.multiply.reduce(shape, dtype=numpy.int64) 

 

# Now read the actual data. 

if dtype.hasobject: 

# The array contained Python objects. We need to unpickle the data. 

if not allow_pickle: 

raise ValueError("Object arrays cannot be loaded when " 

"allow_pickle=False") 

if pickle_kwargs is None: 

pickle_kwargs = {} 

try: 

array = pickle.load(fp, **pickle_kwargs) 

except UnicodeError as err: 

if sys.version_info[0] >= 3: 

# Friendlier error message 

raise UnicodeError("Unpickling a python object failed: %r\n" 

"You may need to pass the encoding= option " 

"to numpy.load" % (err,)) 

raise 

else: 

if isfileobj(fp): 

# We can use the fast fromfile() function. 

array = numpy.fromfile(fp, dtype=dtype, count=count) 

else: 

# This is not a real file. We have to read it the 

# memory-intensive way. 

# crc32 module fails on reads greater than 2 ** 32 bytes, 

# breaking large reads from gzip streams. Chunk reads to 

# BUFFER_SIZE bytes to avoid issue and reduce memory overhead 

# of the read. In non-chunked case count < max_read_count, so 

# only one read is performed. 

 

# Use np.ndarray instead of np.empty since the latter does 

# not correctly instantiate zero-width string dtypes; see 

# https://github.com/numpy/numpy/pull/6430 

array = numpy.ndarray(count, dtype=dtype) 

 

if dtype.itemsize > 0: 

# If dtype.itemsize == 0 then there's nothing more to read 

max_read_count = BUFFER_SIZE // min(BUFFER_SIZE, dtype.itemsize) 

 

for i in range(0, count, max_read_count): 

read_count = min(max_read_count, count - i) 

read_size = int(read_count * dtype.itemsize) 

data = _read_bytes(fp, read_size, "array data") 

array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype, 

count=read_count) 

 

if fortran_order: 

array.shape = shape[::-1] 

array = array.transpose() 

else: 

array.shape = shape 

 

return array 

 

 

def open_memmap(filename, mode='r+', dtype=None, shape=None, 

fortran_order=False, version=None): 

""" 

Open a .npy file as a memory-mapped array. 

 

This may be used to read an existing file or create a new one. 

 

Parameters 

---------- 

filename : str or path-like 

The name of the file on disk. This may *not* be a file-like 

object. 

mode : str, optional 

The mode in which to open the file; the default is 'r+'. In 

addition to the standard file modes, 'c' is also accepted to mean 

"copy on write." See `memmap` for the available mode strings. 

dtype : data-type, optional 

The data type of the array if we are creating a new file in "write" 

mode, if not, `dtype` is ignored. The default value is None, which 

results in a data-type of `float64`. 

shape : tuple of int 

The shape of the array if we are creating a new file in "write" 

mode, in which case this parameter is required. Otherwise, this 

parameter is ignored and is thus optional. 

fortran_order : bool, optional 

Whether the array should be Fortran-contiguous (True) or 

C-contiguous (False, the default) if we are creating a new file in 

"write" mode. 

version : tuple of int (major, minor) or None 

If the mode is a "write" mode, then this is the version of the file 

format used to create the file. None means use the oldest 

supported version that is able to store the data. Default: None 

 

Returns 

------- 

marray : memmap 

The memory-mapped array. 

 

Raises 

------ 

ValueError 

If the data or the mode is invalid. 

IOError 

If the file is not found or cannot be opened correctly. 

 

See Also 

-------- 

memmap 
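
    Examples
    --------
    Create, fill, flush, and re-open a memory-mapped ``.npy`` file
    (a sketch; the path is illustrative):

    >>> import os, tempfile
    >>> path = os.path.join(tempfile.mkdtemp(), 'example.npy')
    >>> m = open_memmap(path, mode='w+', dtype='<i4', shape=(3,))
    >>> m[:] = [1, 2, 3]
    >>> m.flush()
    >>> open_memmap(path, mode='r').tolist()
    [1, 2, 3]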

 

""" 

if isfileobj(filename): 

raise ValueError("Filename must be a string or a path-like object." 

" Memmap cannot use existing file handles.") 

 

if 'w' in mode: 

# We are creating the file, not reading it. 

# Check if we ought to create the file. 

_check_version(version) 

# Ensure that the given dtype is an authentic dtype object rather 

# than just something that can be interpreted as a dtype object. 

dtype = numpy.dtype(dtype) 

if dtype.hasobject: 

msg = "Array can't be memory-mapped: Python objects in dtype." 

raise ValueError(msg) 

d = dict( 

descr=dtype_to_descr(dtype), 

fortran_order=fortran_order, 

shape=shape, 

) 

# If we got here, then it should be safe to create the file. 

fp = open(os_fspath(filename), mode+'b') 

try: 

used_ver = _write_array_header(fp, d, version) 

# this warning can be removed when 1.9 has aged enough 

if version != (2, 0) and used_ver == (2, 0): 

warnings.warn("Stored array in format 2.0. It can only be" 

"read by NumPy >= 1.9", UserWarning, stacklevel=2) 

offset = fp.tell() 

finally: 

fp.close() 

else: 

# Read the header of the file first. 

fp = open(os_fspath(filename), 'rb') 

try: 

version = read_magic(fp) 

_check_version(version) 

 

shape, fortran_order, dtype = _read_array_header(fp, version) 

if dtype.hasobject: 

msg = "Array can't be memory-mapped: Python objects in dtype." 

raise ValueError(msg) 

offset = fp.tell() 

finally: 

fp.close() 

 

if fortran_order: 

order = 'F' 

else: 

order = 'C' 

 

# We need to change a write-only mode to a read-write mode since we've 

# already written data to the file. 

if mode == 'w+': 

mode = 'r+' 

 

marray = numpy.memmap(filename, dtype=dtype, shape=shape, order=order, 

mode=mode, offset=offset) 

 

return marray 

 

 

def _read_bytes(fp, size, error_template="ran out of data"): 

""" 

Read from file-like object until size bytes are read. 

    Raises ValueError if EOF is encountered before size bytes are read.
    Non-blocking objects are only supported if they derive from io objects.

 

Required as e.g. ZipExtFile in python 2.6 can return less data than 

requested. 

""" 

data = bytes() 

while True: 

# io files (default in python3) return None or raise on 

# would-block, python2 file will truncate, probably nothing can be 

# done about that. note that regular files can't be non-blocking 

try: 

r = fp.read(size - len(data)) 

data += r 

if len(r) == 0 or len(data) == size: 

break 

except io.BlockingIOError: 

pass 

if len(data) != size: 

msg = "EOF: reading %s, expected %d bytes got %d" 

raise ValueError(msg % (error_template, size, len(data))) 

else: 

return data