一、IMDB数据集
1.1、下载数据集
from __future__ import absolute_import, division, print_function
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np
# Load the IMDB reviews dataset (the 'subwords8k' config) together with its
# metadata. The result unpacks into the train/test datasets and an `info` object.
(train_data, test_data), info = tfds.load(
    # Dataset name / config to load.
    name='imdb_reviews/subwords8k',
    # Request the train and test splits; they come back as a tuple in this order.
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    # Yield (example, label) pairs from the dataset instead of a dictionary.
    as_supervised=True,
    # Also return the dataset `info` structure alongside the data.
    with_info=True,
)
1.2、探索数据集
1.2.1、encoder
# Inspect the runtime types of the objects returned by tfds.load.
print(type(train_data))
print(type(test_data))
print(type(info))

# The 'text' feature carries the encoder that maps text to integer ids and back.
encoder = info.features['text'].encoder
print(type(encoder))
print('Vocabulary size: {}'.format(encoder.vocab_size))

# Round-trip a sample string through the encoder.
sample_word = 'hello tensorflow'
encoded_Arr = encoder.encode(sample_word)  # encode: text -> list of ids
print(encoded_Arr)  # [3618, 222, 943, 2327, 2934]
original_word = encoder.decode(encoded_Arr)  # decode: ids -> text
print(original_word)
print(sample_word == original_word)

# Show which piece of text each individual id decodes to.
for ts in encoded_Arr:
    print(ts, '--->', encoder.decode([ts]))
1.2.2、探索数据内容与格式
# Peek at the first (example, label) pair of the training set.
for train_example, train_label in train_data.take(1):
    # Each example is a tensor of integer ids representing one movie review.
    # Sample output of the first ten ids:
    # tf.Tensor([ 249 4 277 309 560 6 6639 4574 2 12], shape=(10,), dtype=int64)
    print(train_example[0:10])
    # Decode the ids back into the review text, e.g.
    # "As a lifelong fan of Dickens, I have invariably been disappointed by
    # adaptations of his novels. Altho..."
    print(encoder.decode(train_example))
    # Label: 0 = negative, 1 = positive.
    print(train_label)