Skip to content

Commit 700bdf3

Browse files
Merge pull request #160 from libffcv/v1.0.0
V1.0.0
2 parents b865918 + 7cd3442 commit 700bdf3

29 files changed

Lines changed: 1114 additions & 298 deletions

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,29 @@ Keep your training algorithm the same, just replace the data loader! Look at the
3434
<img src="docs/_static/perf_scatterplot.svg" width='830px'/>
3535

3636
## Installation
37+
### Linux
3738
```
3839
conda create -y -n ffcv python=3.9 cupy pkg-config compilers libjpeg-turbo opencv pytorch torchvision cudatoolkit=11.3 numba -c pytorch -c conda-forge
3940
conda activate ffcv
4041
pip install ffcv
4142
```
4243
Troubleshooting note: if the above commands result in a package conflict error, try running ``conda config --env --set channel_priority flexible`` in the environment and rerunning the installation command.
4344

45+
### Windows
46+
* Install <a href="https://opencv.org/releases/">opencv4</a>
47+
* Add `..../opencv/build/x64/vc15/bin` to PATH environment variable
48+
* Install <a href="https://sourceforge.net/projects/libjpeg-turbo/files/">libjpeg-turbo</a>, download libjpeg-turbo-x.x.x-vc64.exe, not gcc64
49+
* Add `..../libjpeg-turbo64/bin` to PATH environment variable
50+
* Install <a href="https://www.sourceware.org/pthreads-win32/">pthread</a>: download the latest release .zip
51+
* After unzipping, rename the `Pre-build.2` folder to `pthread`
52+
* Open `pthread/include/pthread.h`, and add the code below to the top of the file.
53+
```cpp
54+
#define HAVE_STRUCT_TIMESPEC
55+
```
56+
* Add `..../pthread/dll` to PATH environment variable
57+
* Install <a href="https://docs.cupy.dev/en/stable/install.html#installing-cupy">cupy</a> depending on your CUDA Toolkit version.
58+
* `pip install ffcv`
59+
4460
## Citation
4561
If you use FFCV, please cite it as:
4662

ffcv/.DS_Store

6 KB
Binary file not shown.

ffcv/fields/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
from .basics import FloatField, IntField
33
from .rgb_image import RGBImageField
44
from .bytes import BytesField
5-
from .ndarray import NDArrayField
5+
from .ndarray import NDArrayField, TorchTensorField
66
from .json import JSONField
77

88
__all__ = ['Field', 'BytesField', 'IntField', 'FloatField', 'RGBImageField',
9-
'NDArrayField', 'JSONField']
9+
'NDArrayField', 'JSONField', 'TorchTensorField']

ffcv/fields/ndarray.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from typing import Callable, TYPE_CHECKING, Tuple, Type
2+
import warnings
23
import json
34
from dataclasses import replace
45

56
import numpy as np
7+
import torch as ch
68

79
from .base import Field, ARG_TYPE
810
from ..pipeline.operation import Operation
@@ -55,6 +57,10 @@ def __init__(self, dtype:np.dtype, shape:Tuple[int, ...]):
5557
self.dtype = dtype
5658
self.shape = shape
5759
self.element_size = dtype.itemsize * np.prod(shape)
60+
if dtype == np.uint16:
61+
warnings.warn("Pytorch currently doesn't support uint16"
62+
"we recommend storing as int16 and reinterpret your data later"
63+
"in your pipeline")
5864

5965
@property
6066
def metadata_type(self) -> np.dtype:
@@ -93,4 +99,21 @@ def encode(self, destination, field, malloc):
9399
data_region[:] = field.reshape(-1).view('<u1')
94100

95101
def get_decoder_class(self) -> Type[Operation]:
96-
return NDArrayDecoder
102+
return NDArrayDecoder
103+
104+
105+
class TorchTensorField(NDArrayField):
    """A subclass of :class:`~ffcv.fields.Field` supporting
    multi-dimensional fixed size matrices of any torch type.

    Storage is delegated to :class:`NDArrayField` after translating the
    torch dtype to its numpy equivalent.
    """
    def __init__(self, dtype: ch.dtype, shape: Tuple[int, ...]):
        # Translate the torch dtype into the matching numpy dtype via an
        # empty-tensor round trip (torch exposes no direct mapping).
        np_dtype = ch.zeros(0, dtype=dtype).numpy().dtype
        # NDArrayField.__init__ assigns self.dtype and self.shape itself,
        # so no redundant pre-assignment is done here (the original
        # assignments were dead code, overwritten by super().__init__).
        super().__init__(np_dtype, shape)

    def encode(self, destination, field, malloc):
        """Encode a torch tensor by serializing its numpy view."""
        field = field.numpy()
        return super().encode(destination, field, malloc)

ffcv/libffcv.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
import ctypes
22
from numba import njit
33
import numpy as np
4+
import platform
45
from ctypes import CDLL, c_int64, c_uint8, c_uint64, POINTER, c_void_p, c_uint32, c_bool, cdll
56
import ffcv._libffcv
67

78
lib = CDLL(ffcv._libffcv.__file__)
8-
libc = cdll.LoadLibrary('libc.so.6')
9+
if platform.system() == "Windows":
10+
libc = cdll.msvcrt
11+
read_c = libc._read
12+
else:
13+
libc = cdll.LoadLibrary('libc.so.6')
14+
read_c = libc.pread
915

10-
read_c = libc.pread
1116
read_c.argtypes = [c_uint32, c_void_p, c_uint64, c_uint64]
1217

1318
def read(fileno:int, destination:np.ndarray, offset:int):
@@ -47,5 +52,5 @@ def imdecode(source: np.ndarray, dst: np.ndarray,
4752
ctypes_memcopy.argtypes = [c_void_p, c_void_p, c_uint64]
4853

4954
def memcpy(source: np.ndarray, dest: np.ndarray):
50-
return ctypes_memcopy(source.ctypes.data, dest.ctypes.data, source.size)
55+
return ctypes_memcopy(source.ctypes.data, dest.ctypes.data, source.size*source.itemsize)
5156

ffcv/loader/epoch_iterator.py

Lines changed: 50 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,17 @@
1919
(`OrderOption.QUASI_RANDOM`) in the dataloader constructor's `order` argument.
2020
'''
2121

22+
def select_buffer(buffer, batch_slot, count):
    """Return the portion of `buffer` relevant to one batch.

    Selects the `batch_slot` entry and truncates it to `count` elements.
    Tuples of buffers are handled element-wise (recursively), and a
    `None` buffer passes through unchanged.
    """
    if isinstance(buffer, tuple):
        selected = []
        for part in buffer:
            selected.append(select_buffer(part, batch_slot, count))
        return tuple(selected)

    if buffer is None:
        return None

    return buffer[batch_slot][:count]
31+
32+
2233
class EpochIterator(Thread):
2334
def __init__(self, loader: 'Loader', order: Sequence[int]):
2435
super().__init__(daemon=True)
@@ -33,6 +44,10 @@ def __init__(self, loader: 'Loader', order: Sequence[int]):
3344
self.terminate_event = Event()
3445
self.memory_context = self.loader.memory_manager.schedule_epoch(
3546
batches)
47+
48+
if IS_CUDA:
49+
self.current_stream = ch.cuda.current_stream()
50+
3651
try:
3752
self.memory_context.__enter__()
3853
except MemoryError as e:
@@ -44,23 +59,13 @@ def __init__(self, loader: 'Loader', order: Sequence[int]):
4459

4560
self.storage_state = self.memory_context.state
4661

47-
self.memory_bank_per_stage = defaultdict(list)
48-
4962
self.cuda_streams = [(ch.cuda.Stream() if IS_CUDA else None)
5063
for _ in range(self.loader.batches_ahead + 2)]
5164

52-
# Allocate all the memory
53-
memory_allocations = {}
54-
for (p_id, p) in self.loader.pipelines.items():
55-
memory_allocations[p_id] = p.allocate_memory(self.loader.batch_size,
56-
self.loader.batches_ahead + 2)
57-
58-
# Assign each memory bank to the pipeline stage it belongs to
59-
for s_ix, banks in self.loader.memory_bank_keys_per_stage.items():
60-
for (pipeline_name, op_id) in banks:
61-
self.memory_bank_per_stage[s_ix].append(
62-
memory_allocations[pipeline_name][op_id]
63-
)
65+
self.memory_allocations = self.loader.graph.allocate_memory(
66+
self.loader.batch_size,
67+
self.loader.batches_ahead + 2
68+
)
6469

6570
self.start()
6671

@@ -77,6 +82,7 @@ def run(self):
7782
self.current_batch_slot = (
7883
slot + 1) % (self.loader.batches_ahead + 2)
7984
result = self.run_pipeline(b_ix, ixes, slot, events[slot])
85+
# print("RES", b_ix, "ready")
8086
to_output = (slot, result)
8187
while True:
8288
try:
@@ -88,23 +94,24 @@ def run(self):
8894
if self.terminate_event.is_set():
8995
return
9096
if IS_CUDA:
97+
# print("SUB", b_ix)
9198
# We were able to submit this batch
9299
# Therefore it means that the user must have entered the for loop for
93100
# (batch_slot - batch_ahead + 1) % (batches ahead + 2)
94101
# Therefore batch_slot - batch_ahead must have all it's work submitted
95102
# We will record an event of all the work submitted on the main stream
96103
# and make sure no one overwrite the data until they are done
97-
just_finished_slot = (slot - self.loader.batches_ahead) % (self.loader.batches_ahead + 2)
104+
just_finished_slot = (slot - self.loader.batches_ahead - 1) % (self.loader.batches_ahead + 2)
105+
# print("JFS", just_finished_slot)
98106
event = ch.cuda.Event()
99-
event.record(ch.cuda.default_stream())
107+
event.record(self.current_stream)
100108
events[just_finished_slot] = event
101109
b_ix += 1
102110

103111
except StopIteration:
104112
self.output_queue.put(None)
105113

106114
def run_pipeline(self, b_ix, batch_indices, batch_slot, cuda_event):
107-
# print(b_ix, batch_indices)
108115
self.memory_context.start_batch(b_ix)
109116
args = []
110117
if IS_CUDA:
@@ -114,28 +121,35 @@ def run_pipeline(self, b_ix, batch_indices, batch_slot, cuda_event):
114121
ctx = nullcontext()
115122
first_stage = False
116123

124+
125+
code, outputs = self.loader.code
117126
with ctx:
118127
if IS_CUDA:
119128
if cuda_event:
120129
cuda_event.wait()
121-
for stage, banks in self.memory_bank_per_stage.items():
122-
args.insert(0, batch_indices)
123-
for bank in banks:
124-
if bank is not None:
125-
if isinstance(bank, tuple):
126-
bank = tuple(x[batch_slot] for x in bank)
127-
else:
128-
bank = bank[batch_slot]
129-
args.append(bank)
130-
args.append(self.metadata)
131-
args.append(self.storage_state)
132-
code = self.loader.code_per_stage[stage]
133-
result = code(*args)
134-
args = list(result)
135-
if first_stage:
136-
first_stage = False
137-
self.memory_context.end_batch(b_ix)
138-
return tuple(x[:len(batch_indices)] for x in args)
130+
131+
args = {
132+
'batch_indices': batch_indices,
133+
'storage_state': self.storage_state,
134+
'metadata': self.metadata,
135+
**{
136+
f'memory_{k}':select_buffer(v, batch_slot, len(batch_indices))
137+
for (k, v) in self.memory_allocations['operation'].items()
138+
},
139+
**{
140+
f'shared_memory_{k}': select_buffer(v, batch_slot, len(batch_indices))
141+
for (k, v) in self.memory_allocations['shared'].items()
142+
}
143+
}
144+
145+
for stage_code, define_outputs in code:
146+
results = stage_code(**args)
147+
for node_id, result in zip(define_outputs, results):
148+
args[f'result_{node_id}'] = result
149+
pass
150+
151+
result = tuple(args[f'result_{x}'] for x in outputs)
152+
return result
139153

140154
def __next__(self):
141155
result = self.output_queue.get()
@@ -146,7 +160,7 @@ def __next__(self):
146160
if IS_CUDA:
147161
stream = self.cuda_streams[slot]
148162
# We wait for the copy to be done
149-
ch.cuda.current_stream().wait_stream(stream)
163+
self.current_stream.wait_stream(stream)
150164
return result
151165

152166
def __iter__(self):

0 commit comments

Comments
 (0)