HW4 Speaker classification-SIMPLE (TRANSFORMER)

2023-12-22 19:01:00


Task description
Classify the speakers of given features.

Main goal: learn how to use the Transformer.

Baselines:

Easy: Run the sample code and know how to use a Transformer.
Medium: Know how to adjust the parameters of the Transformer.
Strong: Construct a Conformer, a variant of the Transformer.
Boss: Implement Self-Attention Pooling & Additive Margin Softmax to further boost performance (a sketch of both is given at the end of this post).
Other links

Kaggle: [link](https://www.kaggle.com/competitions/ml2022spring-hw4)
Slide: link
Data: link
The sample-code model (Medium baseline: tune the Transformer hyperparameters such as d_model, nhead, dim_feedforward, and num_layers):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class Classifier(nn.Module):
	def __init__(self, d_model=224, n_spks=600, dropout=0.2):
		super().__init__()
		# Project the features from the 40-dim input into d_model.
		self.prenet = nn.Linear(40, d_model)
		# TODO:
		#   Change Transformer to Conformer.
		#   https://arxiv.org/abs/2005.08100
		# Note: nhead must divide d_model evenly (224 / 2 = 112).
		self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, dim_feedforward=d_model*2, nhead=2, dropout=dropout)
		self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=3)

		# Project the features from d_model to the number of speakers.
		self.pred_layer = nn.Sequential(
			nn.BatchNorm1d(d_model),
			#nn.Linear(d_model, d_model),
			#nn.ReLU(),
			nn.Linear(d_model, n_spks),
		)

	def forward(self, mels):
		"""
		args:
			mels: (batch size, length, 40)
		return:
			out: (batch size, n_spks)
		"""
		# out: (batch size, length, d_model)
		out = self.prenet(mels)
		# out: (length, batch size, d_model)
		out = out.permute(1, 0, 2)
		# The encoder layer expects features in the shape of (length, batch size, d_model).
		out = self.encoder(out)
		# out: (batch size, length, d_model)
		out = out.transpose(0, 1)
		# Mean pooling over the time dimension.
		stats = out.mean(dim=1)

		# out: (batch, n_spks)
		out = self.pred_layer(stats)
		return out
```
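A quick shape check catches dimension bugs early. This smoke test is not from the original post; the batch size and segment length below are arbitrary:

```python
# Hypothetical smoke test: feed a random mel-spectrogram batch through the model.
model = Classifier(d_model=224, n_spks=600)
mels = torch.randn(32, 128, 40)   # (batch size, length, 40-dim mel features)
out = model(mels)
print(out.shape)                  # torch.Size([32, 600]): one logit per speaker
```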
The Strong-baseline model replaces the Transformer encoder with a ConformerBlock from the `conformer` package. Note that ConformerBlock is batch-first, so the permute/transpose from the sample code is no longer needed:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
!pip install conformer
from conformer import ConformerBlock

class Classifier(nn.Module):
	def __init__(self, d_model=224, n_spks=600, dropout=0.25):
		super().__init__()
		# Project the features from the 40-dim input into d_model.
		self.prenet = nn.Linear(40, d_model)
		# Change Transformer to Conformer.
		#   https://arxiv.org/abs/2005.08100
		#self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, dim_feedforward=d_model*2, nhead=2, dropout=dropout)
		#self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=3)
		self.encoder = ConformerBlock(
				dim = d_model,
				dim_head = 4,
				heads = 4,
				ff_mult = 4,
				conv_expansion_factor = 2,
				conv_kernel_size = 20,
				attn_dropout = dropout,
				ff_dropout = dropout,
				conv_dropout = dropout,
		)

		# Project the features from d_model to the number of speakers.
		self.pred_layer = nn.Sequential(
			nn.BatchNorm1d(d_model),
			#nn.Linear(d_model, d_model),
			#nn.ReLU(),
			nn.Linear(d_model, n_spks),
		)

	def forward(self, mels):
		"""
		args:
			mels: (batch size, length, 40)
		return:
			out: (batch size, n_spks)
		"""
		# out: (batch size, length, d_model)
		out = self.prenet(mels)
		# Unlike nn.TransformerEncoder (which defaults to (length, batch size,
		# d_model)), ConformerBlock expects (batch size, length, d_model),
		# so the input is passed through as-is.
		out = self.encoder(out)
		# Mean pooling over the time dimension.
		stats = out.mean(dim=1)

		# out: (batch, n_spks)
		out = self.pred_layer(stats)
		return out
```
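The post gives no code for the Boss baseline, so here is a minimal sketch of both techniques, assuming the encoder output shape (batch size, length, d_model) used above. The pooling module would replace `out.mean(dim=1)`, and the AM-Softmax module would replace the final linear layer plus plain cross-entropy; the scale `s=30.0` and margin `m=0.4` are common defaults from the AM-Softmax paper, not values from this post:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SelfAttentionPooling(nn.Module):
	"""Weight each frame by a learned attention score, then sum over time."""
	def __init__(self, d_model):
		super().__init__()
		self.W = nn.Linear(d_model, 1)

	def forward(self, x):
		# x: (batch size, length, d_model)
		attn = torch.softmax(self.W(x).squeeze(-1), dim=-1)  # (batch, length)
		return torch.bmm(attn.unsqueeze(1), x).squeeze(1)    # (batch, d_model)


class AMSoftmaxLoss(nn.Module):
	"""Additive Margin Softmax: cosine logits with a margin on the target class."""
	def __init__(self, d_model, n_spks, s=30.0, m=0.4):
		super().__init__()
		self.weight = nn.Parameter(torch.empty(n_spks, d_model))
		nn.init.xavier_uniform_(self.weight)
		self.s, self.m = s, m

	def forward(self, embeddings, labels):
		# Cosine similarity between L2-normalized embeddings and class weights.
		cos = F.linear(F.normalize(embeddings), F.normalize(self.weight))
		onehot = F.one_hot(labels, cos.size(1)).to(cos.dtype)
		# Subtract the margin only on the target class, then rescale.
		logits = self.s * (cos - self.m * onehot)
		return F.cross_entropy(logits, labels)
```

With these pieces, `forward` would return the pooled d_model-dim embedding instead of class logits, and training would compute `AMSoftmaxLoss(d_model, n_spks)(stats, labels)`.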

Source: https://blog.csdn.net/weixin_39107270/article/details/135153120