跳转至

自定义数据集


环境:MindSpore 1.5.0、Python 3.7.5、CUDA 11.1,使用 `docker pull swr.cn-south-1.myhuaweicloud.com/mindspore/mindspore-gpu-cuda11.1:1.5.0` 镜像。


最近有关于多模态学习的项目,使用MindSpore进行开发,首先需要自定义数据。

MindSpore中的数据集通过dataset API定义,其中有一系列内置的数据集解决方案,在本文撰写(2021.12.29)的时候支持了CelebA、Cifar10、Cifar100、COCO、ImageFolder、Mnist、VOC(以上为计算机视觉)、CLUE(文本)、Graph(图神经网络),还有CSV、Text等文本格式,以及Manifest、TFRecord、MindRecord等预处理过的结构化格式。

但是,问题是,我们现在需要在一个全新的数据集上很快地跑出结果,这就需要自定义数据集。

最简单的自定义数据集方式是GeneratorDataset。顾名思义,其实现了Python的generator、iterable接口,可以作为迭代的对象。

class mindspore.dataset.GeneratorDataset(
    source,
    column_names=None,
    column_types=None,
    schema=None,
    num_samples=None,
    num_parallel_workers=1,
    shuffle=None,
    sampler=None,
    num_shards=None,
    shard_id=None,
    python_multiprocessing=True,
    max_rowsize=6
)

我们目前最关注的是source和column_names两个参数。source是一个可迭代对象,应该是一个(generator | iterable | random accessible)对象。概括来说,这个对象应该在每次访问时返回一个tuple,tuple中的每个元素都必须是np.array对象或tuple、list;tuple可嵌套,但最终对象一定要是np.array。具体来说,有以下五种方法定义这个对象:

import mindspore.dataset as ds
import numpy as np

# 1) generator callable: 单列数据,np.array,包装在tuple里
def generator_multidimensional():
    """Yield 64 single-column rows, each holding one 2x2 array.

    Row ``i`` is the 1-tuple ``(np.array([[i, i + 1], [i + 2, i + 3]]),)``.
    The trailing comma keeps each row a tuple, so it maps to one column.
    """
    for i in range(64):
        yield (np.array([[i, i + 1], [i + 2, i + 3]]),)

# The generator function itself is the source; its rows fill one column.
dataset = ds.GeneratorDataset(
    source=generator_multidimensional,
    column_names=["multi_dimensional_data"],
)

# 2) generator callable: 多列数据,np.array,包装在tuple
def generator_multi_column():
    """Yield 64 two-column rows as ``(col1, col2)`` tuples.

    For row ``i``, ``col1`` is ``np.array([i])`` and ``col2`` is the 2x2
    array ``np.array([[i, i + 1], [i + 2, i + 3]])``.
    """
    for i in range(64):
        yield np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]])

# One column name per element of the tuples the generator yields.
dataset = ds.GeneratorDataset(
    source=generator_multi_column,
    column_names=["col1", "col2"],
)

# 3) 可迭代对象(实现了__next__、__iter__、__len__方法)
class MyIterable:
    """Iterator-style source over five random ``(data, label)`` rows.

    Implements ``__iter__``, ``__next__`` and ``__len__``. Calling ``iter()``
    rewinds the cursor, so the same object can be traversed more than once.
    """

    def __init__(self):
        self._index = 0
        # Data is drawn before labels; keep this order so seeded runs match.
        self._data = np.random.sample((5, 2))
        self._label = np.random.sample((5, 1))

    def __len__(self):
        """Return the number of rows."""
        return len(self._data)

    def __iter__(self):
        """Reset the cursor to the first row and return this object."""
        self._index = 0
        return self

    def __next__(self):
        """Return the next ``(data_row, label_row)`` pair.

        Raises:
            StopIteration: when every row has already been returned.
        """
        cursor = self._index
        if cursor >= len(self._data):
            raise StopIteration
        self._index = cursor + 1
        return self._data[cursor], self._label[cursor]

# An instance (not the class) is passed as the source.
dataset = ds.GeneratorDataset(
    source=MyIterable(),
    column_names=["data", "label"],
)

# 4) 可随机访问对象(实现了__getitem__和__len__方法)
class MyAccessible:
    """Random-access source: five random ``(data, label)`` rows by index.

    Implements ``__getitem__`` and ``__len__`` only.
    """

    def __init__(self):
        # Data is drawn before labels; keep this order so seeded runs match.
        self._data = np.random.sample((5, 2))
        self._label = np.random.sample((5, 1))

    def __len__(self):
        """Return the number of rows."""
        return len(self._data)

    def __getitem__(self, index):
        """Return the ``(data_row, label_row)`` pair stored at ``index``."""
        data_row = self._data[index]
        label_row = self._label[index]
        return data_row, label_row

# An instance (not the class) is passed as the source.
dataset = ds.GeneratorDataset(
    source=MyAccessible(),
    column_names=["data", "label"],
)

# 5) Python原生对象(可以是list、tuple或generator),但内部**必须是**tuple包装的一或多个np.array
# Three rows, each a 1-tuple wrapping a zero-dimensional array.
dataset = ds.GeneratorDataset(
    source=[(np.array(0),), (np.array(1),), (np.array(2),)],
    column_names=["col"],
)

其中,第四种方法理解较为简单,在各框架中通用性高,较为推荐。

使用方法:

#%% In[1]
import mindspore as ms
from mindspore.dataset import GeneratorDataset
import numpy as np

class RandomScalarDataset:
    """Random-access dataset of ``size`` rows: a feature and a target each.

    Every row pairs a float64 vector of length 2 with a uint64 vector of
    length 1. Targets are samples from [0, 1) cast to uint64, so every
    target value is 0.
    """

    def __init__(self, size=4):
        # Features are drawn before targets; keep this order for seeded runs.
        features = np.random.sample((size, 2))
        targets = np.random.sample((size, 1))
        self._data = features
        self._label = targets.astype(np.uint64)

    def __len__(self):
        """Return the number of rows."""
        return len(self._data)

    def __getitem__(self, index):
        """Return the ``(feature, target)`` pair stored at ``index``."""
        return self._data[index], self._label[index]

if __name__ == '__main__':
    # Seed NumPy before the dataset draws its random arrays.
    np.random.seed(42)
    dataset = RandomScalarDataset()
    generator_dataset = GeneratorDataset(
        source=dataset,
        column_names=['scalar', 'target'],
    )

    # First pass: print every row produced by the tuple iterator.
    it = generator_dataset.create_tuple_iterator()
    for i in it:
        print(i)

    # Second pass: print every row produced by the dict iterator.
    it = generator_dataset.create_dict_iterator()
    for i in it:
        print(i)

预期输出(未指定shuffle和sampler时,GeneratorDataset默认会随机打乱可随机访问的数据源,因此两次迭代打印的行顺序并不相同):

[Tensor(shape=[2], dtype=Float64, value= [ 5.80836122e-02,  8.66176146e-01]), Tensor(shape=[1], dtype=UInt64, value= [0])]
[Tensor(shape=[2], dtype=Float64, value= [ 3.74540119e-01,  9.50714306e-01]), Tensor(shape=[1], dtype=UInt64, value= [0])]
[Tensor(shape=[2], dtype=Float64, value= [ 7.31993942e-01,  5.98658484e-01]), Tensor(shape=[1], dtype=UInt64, value= [0])]
[Tensor(shape=[2], dtype=Float64, value= [ 1.56018640e-01,  1.55994520e-01]), Tensor(shape=[1], dtype=UInt64, value= [0])]
{'scalar': Tensor(shape=[2], dtype=Float64, value= [ 1.56018640e-01,  1.55994520e-01]), 'target': Tensor(shape=[1], dtype=UInt64, value= [0])}
{'scalar': Tensor(shape=[2], dtype=Float64, value= [ 7.31993942e-01,  5.98658484e-01]), 'target': Tensor(shape=[1], dtype=UInt64, value= [0])}
{'scalar': Tensor(shape=[2], dtype=Float64, value= [ 3.74540119e-01,  9.50714306e-01]), 'target': Tensor(shape=[1], dtype=UInt64, value= [0])}
{'scalar': Tensor(shape=[2], dtype=Float64, value= [ 5.80836122e-02,  8.66176146e-01]), 'target': Tensor(shape=[1], dtype=UInt64, value= [0])}

测试 2(tuple 嵌套)

import mindspore as ms
from mindspore.dataset import GeneratorDataset
import numpy as np

class RandomScalarDataset:
    """Random-access dataset whose target is a nested tuple.

    Every row pairs a float64 vector of length 2 with a 2-tuple that holds
    the same uint64 label (an integer drawn from [0, 42)) twice.
    """

    def __init__(self, size=4):
        # Features are drawn before labels; keep this order for seeded runs.
        features = np.random.sample((size, 2))
        labels = np.random.randint(0, 42, (size, 1))
        self._data = features
        self._label = labels.astype(np.uint64)

    def __len__(self):
        """Return the number of rows."""
        return len(self._data)

    def __getitem__(self, index):
        """Return ``(feature, (label, label))`` for the row at ``index``."""
        feature = self._data[index]
        target_pair = (self._label[index], self._label[index])
        return feature, target_pair

if __name__ == '__main__':
    # Seed NumPy before the dataset draws its random arrays.
    np.random.seed(42)
    dataset = RandomScalarDataset()
    generator_dataset = GeneratorDataset(
        source=dataset,
        column_names=['scalar', 'target'],
    )

    # First pass: print every row produced by the tuple iterator.
    it = generator_dataset.create_tuple_iterator()
    for i in it:
        print(i)

    # Second pass: print every row produced by the dict iterator.
    it = generator_dataset.create_dict_iterator()
    for i in it:
        print(i)

输出(可以看到,嵌套tuple中两个shape为[1]的数组在target列中变成了一个shape为[2, 1]的Tensor):

[Tensor(shape=[2], dtype=Float64, value= [ 3.74540119e-01,  9.50714306e-01]), Tensor(shape=[2, 1], dtype=UInt64, value=
[[35],
 [35]])]
[Tensor(shape=[2], dtype=Float64, value= [ 7.31993942e-01,  5.98658484e-01]), Tensor(shape=[2, 1], dtype=UInt64, value=
[[39],
 [39]])]
[Tensor(shape=[2], dtype=Float64, value= [ 1.56018640e-01,  1.55994520e-01]), Tensor(shape=[2, 1], dtype=UInt64, value=
[[23],
 [23]])]
[Tensor(shape=[2], dtype=Float64, value= [ 5.80836122e-02,  8.66176146e-01]), Tensor(shape=[2, 1], dtype=UInt64, value=
[[2],
 [2]])]
{'scalar': Tensor(shape=[2], dtype=Float64, value= [ 5.80836122e-02,  8.66176146e-01]), 'target': Tensor(shape=[2, 1], dtype=UInt64, value=
[[2],
 [2]])}
{'scalar': Tensor(shape=[2], dtype=Float64, value= [ 3.74540119e-01,  9.50714306e-01]), 'target': Tensor(shape=[2, 1], dtype=UInt64, value=
[[35],
 [35]])}
{'scalar': Tensor(shape=[2], dtype=Float64, value= [ 1.56018640e-01,  1.55994520e-01]), 'target': Tensor(shape=[2, 1], dtype=UInt64, value=
[[23],
 [23]])}
{'scalar': Tensor(shape=[2], dtype=Float64, value= [ 7.31993942e-01,  5.98658484e-01]), 'target': Tensor(shape=[2, 1], dtype=UInt64, value=
[[39],
 [39]])}