A Standard Workflow for Processing Text Data in PyTorch
torchtext is the Torch library for processing text data.
import torch
import torchtext
Building Fields: a Field determines how the data is processed
from torchtext.data import Field
# torchtext.data.Field(
#     sequential=True,        # whether the data is sequential; if False, no tokenization is applied
#     use_vocab=True,         # whether to use a Vocab object; if False, the data must already be numerical
#     init_token=None,        # token prepended to every example of this field
#     eos_token=None,         # token appended to every example of this field
#     fix_length=None,        # pad/truncate all examples to this fixed length
#     dtype=torch.int64,
#     preprocessing=None,     # pipeline applied after tokenization but before numericalization
#     postprocessing=None,    # pipeline applied after numericalization, before the numbers become a Tensor
#     lower=False,            # whether to lowercase all text
#     tokenize=None,          # e.g. tokenize='spacy' invokes the SpaCy tokenizer
#     tokenizer_language='en',
#     include_lengths=False,
#     batch_first=False,      # whether returned tensors put the batch dimension first
#     pad_token='<pad>',      # token used for padding
#     unk_token='<unk>',      # token used for unknown words
#     pad_first=False,        # whether to pad at the beginning of the sequence
#     truncate_first=False,   # whether to truncate from the beginning
#     stop_words=None,        # words to discard during preprocessing
#     is_target=False)
# Define the Fields
LABEL = Field(sequential=False, use_vocab=False)
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
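Before building any dataset, you can check what a Field does to raw text by calling its preprocess method directly. A minimal sketch (the sample sentence is made up):
# preprocess tokenizes and lowercases, but does not numericalize
print(TEXT.preprocess("The CAT sat on the mat"))
# ['the', 'cat', 'sat', 'on', 'the', 'mat']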
Building the dataset
from torchtext.data import TabularDataset
# TabularDataset: defines a Dataset of columns stored in CSV, TSV, or JSON format.
# TabularDataset(
#     path,
#     format,             # the data format
#     fields,             # list(tuple(str, Field)); if a list, format must be CSV or TSV,
#                         # and the (str, Field) pairs must correspond one-to-one with the columns
#     skip_header,        # bool: whether to skip the first line of the file (especially for CSV headers)
#     csv_reader_params)  # dict: parameters to pass to the csv reader; only relevant when format is csv or tsv
# Build the datasets
tv_datafields =[("id",None),# we won't be needing the id, so we pass in None as the field
机场行李寄存("comment_text", TEXT),("toxic", LABEL),
("severe_toxic", LABEL),("threat", LABEL),
("obscene", LABEL),("insult", LABEL),
("identity_hate", LABEL)]
# The splits method creates a dataset for each of the train and validation sets in one call, since the process is identical for both.
trn, vld = TabularDataset.splits(
    path="data",  # the root directory where the data lies
    train='train.csv', validation="valid.csv",
    format='csv',
    skip_header=True,  # if your csv file has a header, make sure to pass this so it doesn't get processed as data!
    fields=tv_datafields)
tst_datafields =[("id",None),# we won't be needing the id, so we pass in None as the field
("comment_text", TEXT)]
tst = TabularDataset(
锦鲤的饲养方法path="data/test.csv",# the file path
format='csv',
skip_header=True,# if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
fields=tst_datafields)
trn[0]  # an Example object; at this point the text has only been tokenized, not yet numericalized
<torchtext.data.example.Example at 0x1a9d8d39d88>
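You can peek inside the Example to verify this; a small inspection sketch (the attribute names come from the tv_datafields list defined above):
# every bound field becomes an attribute on the Example
print(trn[0].__dict__.keys())
# dict_keys(['comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])
print(trn[0].comment_text[:3])  # the first few tokens, still plain strings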
# field.build_vocab()
# This line makes torchtext iterate over all the training-set data bound to the TEXT field and
# register the words in the vocabulary. It can also build the embedding matrix automatically;
# out-of-vocabulary (OOV) words are mapped to <unk>.
TEXT.build_vocab(trn)
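Once built, the vocabulary lives on the field as TEXT.vocab and can be inspected directly; build_vocab also accepts a vectors argument for pretrained embeddings. A hedged sketch ("glove.6B.100d" is one of torchtext's built-in vector aliases, and the lookup word 'the' is just an illustration):
print(len(TEXT.vocab))                   # vocabulary size, including <unk> and <pad>
print(TEXT.vocab.freqs.most_common(10))  # the ten most frequent tokens in the training set
print(TEXT.vocab.stoi['the'])            # the string-to-index mapping used for numericalization

# optionally, attach pretrained embeddings while building the vocab
# (downloads the GloVe vectors on first use):
# TEXT.build_vocab(trn, vectors="glove.6B.100d")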
Constructing iterators
# Construct the iterators
from torchtext.data import Iterator, BucketIterator
# torchtext.data.Iterator(
#     dataset,
#     batch_size,
#     sort_key=None,           # key used to group examples of similar length, minimizing padding
#     device=None,
#     batch_size_fn=None,
#     train=True,              # whether the iterator represents a training set
#     repeat=False,            # whether to repeat the iterator over multiple epochs
#     shuffle=None,
#     sort=None,
#     sort_within_batch=None)
train_iter, val_iter = BucketIterator.splits(
    (trn, vld),  # we pass in the datasets we want the iterator to draw data from
    batch_sizes=(64, 64),
    device=-1,  # if you want to use the GPU, specify the GPU number here
    sort_key=lambda x: len(x.comment_text),  # the BucketIterator needs to be told what function it should use to group the data
    sort_within_batch=False,
    repeat=False  # we pass repeat=False because we want to wrap this Iterator layer
)
# For the test set, a plain Iterator with sorting and shuffling disabled keeps predictions in file order
test_iter = Iterator(tst, batch_size=64, device=-1, sort=False, sort_within_batch=False, repeat=False)
for index, i in enumerate(train_iter):
    print(index, i)
    for j in i:
        print(j)
[torchtext.data.batch.Batch of size 1]
    [.comment_text]:[torch.LongTensor of size 1x1]
    [.toxic]:[torch.LongTensor of size 1]
    [.severe_toxic]:[torch.LongTensor of size 1]
    [.threat]:[torch.LongTensor of size 1]
    [.obscene]:[torch.LongTensor of size 1]
    [.insult]:[torch.LongTensor of size 1]
    [.identity_hate]:[torch.LongTensor of size 1]
(tensor([[2]]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1])) None
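The Batch objects above expose each field as an attribute, but a training loop usually wants plain (x, y) tensors. A minimal wrapper sketch, assuming the field names used above (the BatchWrapper class is our own convenience, not part of torchtext):
import torch

class BatchWrapper:
    """Wrap an Iterator so that each element is an (x, y) pair instead of a Batch."""
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var)  # the comment_text tensor
            # stack the six binary labels into one (batch, 6) float tensor
            y = torch.cat([getattr(batch, v).unsqueeze(1).float() for v in self.y_vars], dim=1)
            yield (x, y)

    def __len__(self):
        return len(self.dl)

label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_dl = BatchWrapper(train_iter, "comment_text", label_cols)
valid_dl = BatchWrapper(val_iter, "comment_text", label_cols)

Each element of train_dl is then a (text, labels) pair that can be fed straight into a model and a BCE-style loss.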
