from fastai.data.all import *
from fastai.vision.all import *
# Download the Oxford-IIIT Pet dataset and gather every image file path.
path = untar_data(URLs.PETS)
images_dir = path / "images"
fnames = get_image_files(images_dir)

# A bare DataBlock has no blocks, getters, or splitter configured at all;
# datasets() then just wraps the items we hand it.
dblock = DataBlock()
dsets = dblock.datasets(fnames)

# Peek at one training sample.
dsets.train[0]
# Output:
# (Path('/home/jhoward/.fastai/data/oxford-iiit-pet/images/Birman_82.jpg'),
#  Path('/home/jhoward/.fastai/data/oxford-iiit-pet/images/Birman_82.jpg'))
# By default, the data block API assumes we have an input and a target, which is why we see our filename repeated twice.
# The first thing we can do is use a get_items function to actually assemble our items inside the data block:
# Let the DataBlock collect its own items: pass a source folder plus a
# get_items callable instead of a ready-made list of files.
dblock = DataBlock(get_items=get_image_files)
dsets = dblock.datasets(path / "images")
dsets.train[0]
# specify label func
def label_func(fname):
    """Label a pet image as "cat" or "dog" from its filename convention.

    In the Oxford-IIIT Pet dataset, cat breed file names are capitalized
    (e.g. ``Birman_82.jpg``) while dog breed file names are lowercase
    (e.g. ``pug_3.jpg``), so the case of the first character of the file
    name decides the class.

    Args:
        fname: a path-like object exposing a ``.name`` attribute
            (e.g. ``pathlib.Path``).

    Returns:
        str: ``"cat"`` if the file name starts with an uppercase letter,
        ``"dog"`` otherwise.
    """
    # Fix: the original return statement was unindented, which raises
    # IndentationError when this file is executed as a script.
    return "cat" if fname.name[0].isupper() else "dog"
# Add a get_y callable so each item becomes an (image file, label) pair.
dblock = DataBlock(
    get_items=get_image_files,
    get_y=label_func,
)
dsets = dblock.datasets(path / "images")
dsets.train[0]
# specify types
# Declare the types: an image input and a categorical target. Each block
# contributes its type-specific transforms (opening the image file,
# turning the label string into a category index).
dblock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=label_func,
)
dsets = dblock.datasets(path / "images")
dsets.train[0]
# CategoryBlock builds the vocabulary from the labels it encountered.
dsets.vocab
# Output: ['cat', 'dog']
# The full pipeline: a random train/valid split plus a per-item resize so
# images share a common size and can be collated into batches.
dblock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=label_func,
    splitter=RandomSplitter(),
    item_tfms=Resize(224),
)
dls = dblock.dataloaders(path / "images")
dls.show_batch()
# MNIST: grayscale images (PILImageBW), labels taken from the parent
# folder name, train/valid split taken from the grandparent folder name.
mnist = DataBlock(
    blocks=(ImageBlock(cls=PILImageBW), CategoryBlock),
    get_items=get_image_files,
    splitter=GrandparentSplitter(),
    get_y=parent_label,
)
dls = mnist.dataloaders(untar_data(URLs.MNIST_TINY))
dls.show_batch(max_n=9, figsize=(4, 4))

# summary() traces one item through every step of the pipeline -- the main
# debugging tool when a DataBlock misbehaves.
mnist.summary(untar_data(URLs.MNIST_TINY))
# resize and augmentation
# Pets again, labelling via a regex on the file name and layering batch-level
# augmentation on top of the per-item resize.
pets = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    splitter=RandomSplitter(),
    # Pipeline: Path -> file name -> breed captured before the trailing index.
    get_y=Pipeline([attrgetter("name"), RegexLabeller(pat=r'^(.*)_\d+.jpg$')]),
    item_tfms=Resize(128),
    batch_tfms=aug_transforms(),
)
dls = pets.dataloaders(untar_data(URLs.PETS) / "images")
dls.show_batch(max_n=9)
# PASCAL 2007 multi-label data ships as a CSV of file names, space-separated
# labels, and an is_valid split flag.
pascal_source = untar_data(URLs.PASCAL_2007)
df = pd.read_csv(pascal_source / "train.csv")
# df.head() output:
#         fname        labels  is_valid
# 0  000005.jpg         chair      True
# 1  000007.jpg           car      True
# 2  000009.jpg  horse person      True
# 3  000012.jpg           car     False
# 4  000016.jpg       bicycle      True
# get_x=ColReader(0, pref=pascal_source/"train")
# reads column 0 and prepends the prefix to build the full image file name.
# Multi-label classification with ColReader getters: column 0 holds the
# file name (prefixed into a full path), column 1 the space-delimited labels.
pascal = DataBlock(
    blocks=(ImageBlock, MultiCategoryBlock),
    splitter=ColSplitter(),  # splits on the is_valid column
    get_x=ColReader(0, pref=pascal_source / "train"),
    get_y=ColReader(1, label_delim=' '),
    item_tfms=Resize(224),
    batch_tfms=aug_transforms(),
)
dls = pascal.dataloaders(df)
dls.show_batch()
# Same pipeline, written with positional lambdas instead of ColReader:
# Equivalent datablock whose getters index the dataframe row positionally
# (x[0] = file name column, x[1] = labels column).
pascal = DataBlock(
    blocks=(ImageBlock, MultiCategoryBlock),
    splitter=ColSplitter(),
    get_x=lambda x: pascal_source / "train" / f'{x[0]}',
    get_y=lambda x: x[1].split(' '),
    item_tfms=Resize(224),
    batch_tfms=aug_transforms(),
)
dls = pascal.dataloaders(df)
dls.show_batch()
# use lambda
# Lambdas again, this time reading the row's columns by attribute name.
pascal = DataBlock(
    blocks=(ImageBlock, MultiCategoryBlock),
    splitter=ColSplitter(),
    get_x=lambda o: f'{pascal_source}/train/' + o.fname,
    get_y=lambda o: o.labels.split(),
    item_tfms=Resize(224),
    batch_tfms=aug_transforms(),
)
dls = pascal.dataloaders(df)
dls.show_batch()
# use columns as attributes
# NOTE(review): this block is byte-identical in behavior to the previous
# "use lambda" block -- the rows are accessed as attributes in both; the
# tutorial repeats it under a different heading.
pascal = DataBlock(
    blocks=(ImageBlock, MultiCategoryBlock),
    splitter=ColSplitter(),
    get_x=lambda o: f'{pascal_source}/train/' + o.fname,
    get_y=lambda o: o.labels.split(),
    item_tfms=Resize(224),
    batch_tfms=aug_transforms(),
)
dls = pascal.dataloaders(df)
dls.show_batch()
# use from_columns
# The most efficient way (to avoid iterating over the rows of the dataframe, which can take a long time) is to use the from_columns method.
# It will use get_items to convert the columns into numpy arrays. The drawback is that since we lose the dataframe after extracting the relevant columns,
# we can’t use a ColSplitter anymore. Here we used an IndexSplitter after manually extracting the index of the validation set from the dataframe:
def _pascal_items(x):
    """Map dataframe columns to (full image paths, per-row label lists)."""
    return (f'{pascal_source}/train/' + x.fname, x.labels.str.split())


# from_columns extracts whole columns at once (no per-row iteration), so the
# dataframe is no longer available at split time and ColSplitter cannot be
# used; the validation indices are pulled out of the dataframe beforehand.
valid_idx = df[df['is_valid']].index.values
pascal = DataBlock.from_columns(
    blocks=(ImageBlock, MultiCategoryBlock),
    get_items=_pascal_items,
    splitter=IndexSplitter(valid_idx),
    item_tfms=Resize(224),
    batch_tfms=aug_transforms(),
)
# Segmentation: the target is a mask whose class codes are loaded from
# codes.txt; each label file is derived from the image's file name.
path = untar_data(URLs.CAMVID_TINY)
camvid = DataBlock(
    blocks=(ImageBlock, MaskBlock(codes=np.loadtxt(path / 'codes.txt', dtype=str))),
    get_items=get_image_files,
    splitter=RandomSplitter(),
    # image "x.png" -> mask "labels/x_P.png"
    get_y=lambda o: path / 'labels' / f'{o.stem}_P{o.suffix}',
    batch_tfms=aug_transforms(),
)
dls = camvid.dataloaders(path / "images")
dls.show_batch()
# Point regression: targets are head centres stored in a pickled dict
# keyed by image file name.
biwi_source = untar_data(URLs.BIWI_SAMPLE)
fn2ctr = load_pickle(biwi_source / 'centers.pkl')
biwi = DataBlock(
    blocks=(ImageBlock, PointBlock),
    get_items=get_image_files,
    splitter=RandomSplitter(),
    # NOTE(review): flip(0) appears to reorder the stored coordinates into
    # the order PointBlock expects -- confirm against the fastai docs.
    get_y=lambda o: fn2ctr[o.name].flip(0),
    batch_tfms=aug_transforms(),
)
dls = biwi.dataloaders(biwi_source)
dls.show_batch(max_n=9)
# Object detection: one image input and two targets (bounding boxes and
# their labels), both looked up from the COCO annotations by file name.
coco_source = untar_data(URLs.COCO_TINY)
images, lbl_bbox = get_annotations(coco_source / 'train.json')
img2bbox = dict(zip(images, lbl_bbox))
coco = DataBlock(
    blocks=(ImageBlock, BBoxBlock, BBoxLblBlock),
    get_items=get_image_files,
    splitter=RandomSplitter(),
    # One getter per target: the boxes first, then their labels.
    get_y=[lambda o: img2bbox[o.name][0], lambda o: img2bbox[o.name][1]],
    item_tfms=Resize(128),
    batch_tfms=aug_transforms(),
    n_inp=1,  # only the first block is an input; the rest are targets
)
dls = coco.dataloaders(coco_source)
dls.show_batch(max_n=9)
from fastai.text.all import *
# IMDB sample: reviews and sentiment labels in a CSV.
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path / 'texts.csv')
df.head()

# Language-model data: a single TextBlock read from the 'text' column,
# with is_lm=True switching the pipeline into language-model mode.
imdb_lm = DataBlock(
    blocks=TextBlock.from_df('text', is_lm=True),
    get_x=ColReader('text'),
    splitter=ColSplitter(),
)
dls = imdb_lm.dataloaders(df, bs=64, seq_len=72)
dls.show_batch(max_n=6)
# You need to make sure to use the same seq_len in TextBlock and the Learner you will define later on.
# Classifier data: reuse the language model's vocabulary (dls here is the
# LM DataLoaders built just above) so token ids line up between the two.
imdb_clas = DataBlock(
    blocks=(TextBlock.from_df('text', seq_len=72, vocab=dls.vocab), CategoryBlock),
    get_x=ColReader('text'),
    get_y=ColReader('label'),
    splitter=ColSplitter(),
)
dls = imdb_clas.dataloaders(df, bs=64)
dls.show_batch()
from fastai.tabular.core import *
# Tabular data: TabularPandas runs the preprocessing procs (categorify,
# fill missing, normalize) and hands out DataLoaders directly, without a
# DataBlock.
adult_source = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(adult_source / 'adult.csv')
df.head()

cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]

splits = RandomSplitter()(range_of(df))
to = TabularPandas(
    df, procs, cat_names, cont_names,
    y_names="salary", splits=splits, y_block=CategoryBlock,
)
dls = to.dataloaders()
dls.show_batch()