一个Text to speech开源框架,
import numpy as np
import tensorflow as tf
# Default hyperparameters
hparams = tf.contrib.training.HParams(
# Comma-separated list of cleaners to run on text prior to training and eval. For non-English
# text you may want to use "basic_cleaners" or "transliteration_cleaners".
num_mels = 80 #Number of mel-spectrogram channels and local conditioning dimensionality
num_freq = 1025 # (= n_fft / 2 + 1) only used when adding linear spectrograms post processing network
rescale = True #Whether to rescale audio prior to preprocessing
rescaling_max = 0.999 #Rescaling value
trim_silence = True #Whether to clip silence in Audio (at beginning and end of audio only not the middle)
clip_mels_length = True #For cases of OOM (Not really recommended working on a workaround)
max_mel_frames = 1300 #Only relevant when clip_mels_length = True
# Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
# It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder
# Does not work if n_fft is not a multiple of hop_size!!
silence_threshold=2 #silence threshold used for sound trimming for wavenet preprocessing
#Mel spectrogram
n_fft = 2048 #Extra window size is filled with 0 paddings to match this parameter
hop_size = 300 #For 22050Hz 275 ~= 12.5 ms
win_size = 1200 #For 22050Hz 1100 ~= 50 ms (If None win_size = n_fft)
sample_rate = 24000 #22050 Hz (corresponding to ljspeech dataset)
frame_shift_ms = None
#M-AILABS (and other datasets) trim params
trim_fft_size = 512
trim_hop_size = 128
trim_top_db = 23
#Mel and Linear spectrograms normalization/scaling and clipping
signal_normalization = True
allow_clipping_in_normalization = True #Only relevant if mel_normalization = True
symmetric_mels = False #Whether to scale the data to be symmetric around 0
max_abs_value = 4. #max absolute value of data. If symmetric data will be [-max max] else [0 max]
normalize_for_wavenet = True #whether to rescale to [0 1] for wavenet.
min_level_db = -100
ref_level_db = 20
fmin = 0 #Set this to 75 if your speaker is male! if female 125 should help taking off noise. (To test depending on dataset)
fmax = 7600
#Griffin Lim
power = 1.5
griffin_lim_iters = 60
outputs_per_step = 2 #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size)
stop_at_any = True #Determines whether the decoder should stop when predicting to any frame or to all of them
embedding_dim = 512 #dimension of embedding space
enc_conv_num_layers = 3 #number of encoder convolutional layers
