Slide 16
Slide 16 text
© DeNA Co., Ltd. 16
class YAAPT(PitchEstimator):
    """Pitch estimator that delegates to ``amfm_decompy.pYAAPT.yaapt``.

    The constructor only records its arguments on the instance; all of the
    actual work happens in :meth:`estimate`.
    """

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        fmin: float = 60.0,
        fmax: float = 400.0,
        frame_length: float = 35.0,
        tda_frame_length: float = 35.0,
        frame_space: float = 10.0,
        fft_length: int = 8192,
        bp_forder: int = 150,
        bp_low: float = 50.0,
        bp_high: float = 1500.0,
        nlfer_thresh1: float = 0.75,
        nlfer_thresh2: float = 0.1,
        shc_numharms: int = 3,
        shc_window: float = 40.0,
        shc_maxpeaks: int = 4,
        shc_pwidth: float = 50.0,
        shc_thresh1: float = 5.0,
        shc_thresh2: float = 1.25,
        f0_double: float = 150.0,
        f0_half: float = 150.0,
        dp5_k1: float = 11.0,
        dec_factor: int = 1,
        nccf_thresh1: float = 0.3,
        nccf_thresh2: float = 0.9,
        nccf_maxcands: int = 3,
        nccf_pwidth: int = 5,
        merit_boost: float = 0.20,
        merit_pivot: float = 0.99,
        merit_extra: float = 0.4,
        median_value: int = 7,
        dp_w1: float = 0.15,
        dp_w2: float = 0.5,
        dp_w3: float = 0.1,
        dp_w4: float = 0.9,
        spec_pitch_min_std: float = 0.05,
        pitch_half: int = 0,
        pitch_half_sens: float = 2.9,
        pitch_double: int = 0,
        pitch_double_sens: float = 2.9,
        smooth_factor: int = 5,
        smooth: int = 5,
        ptch_typ: float = 100.0,
    ) -> None:
        """Store every setting on the instance under the same name.

        Args:
            sample_rate: Passed to ``SignalObj`` together with the waveform.
            hop_length: Stride used to subsample ``pitch.values`` in
                :meth:`estimate`.
            fmin: Forwarded to ``yaapt`` as ``f0_min``.
            fmax: Forwarded to ``yaapt`` as ``f0_max``.
            frame_length, tda_frame_length, frame_space, fft_length,
            bp_forder, bp_low, bp_high, nlfer_thresh1, nlfer_thresh2,
            shc_numharms, shc_window, shc_maxpeaks, shc_pwidth, shc_thresh1,
            shc_thresh2, f0_double, f0_half, dp5_k1, dec_factor,
            nccf_thresh1, nccf_thresh2, nccf_maxcands, nccf_pwidth,
            merit_boost, merit_pivot, merit_extra, median_value, dp_w1,
            dp_w2, dp_w3, dp_w4, spec_pitch_min_std: Forwarded to ``yaapt``
                as keyword arguments of the same name. Units and meaning are
                defined by amfm_decompy (NOTE(review): frame sizes are
                presumably in milliseconds -- confirm against its docs).
            pitch_half, pitch_half_sens, pitch_double, pitch_double_sens,
            smooth_factor, smooth, ptch_typ: Written to the upper-cased
                class attributes of ``PitchObj`` before each ``yaapt`` call.
        """
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.frame_length = frame_length
        self.tda_frame_length = tda_frame_length
        self.frame_space = frame_space
        self.fft_length = fft_length
        self.bp_forder = bp_forder
        self.bp_low = bp_low
        self.bp_high = bp_high
        self.nlfer_thresh1 = nlfer_thresh1
        self.nlfer_thresh2 = nlfer_thresh2
        self.shc_numharms = shc_numharms
        self.shc_window = shc_window
        self.shc_maxpeaks = shc_maxpeaks
        self.shc_pwidth = shc_pwidth
        self.shc_thresh1 = shc_thresh1
        self.shc_thresh2 = shc_thresh2
        self.f0_double = f0_double
        self.f0_half = f0_half
        self.dp5_k1 = dp5_k1
        self.dec_factor = dec_factor
        self.nccf_thresh1 = nccf_thresh1
        self.nccf_thresh2 = nccf_thresh2
        self.nccf_maxcands = nccf_maxcands
        self.nccf_pwidth = nccf_pwidth
        self.merit_boost = merit_boost
        self.merit_pivot = merit_pivot
        self.merit_extra = merit_extra
        self.median_value = median_value
        self.dp_w1 = dp_w1
        self.dp_w2 = dp_w2
        self.dp_w3 = dp_w3
        self.dp_w4 = dp_w4
        self.spec_pitch_min_std = spec_pitch_min_std
        self.pitch_half = pitch_half
        self.pitch_half_sens = pitch_half_sens
        self.pitch_double = pitch_double
        self.pitch_double_sens = pitch_double_sens
        self.smooth_factor = smooth_factor
        self.smooth = smooth
        self.ptch_typ = ptch_typ

    def estimate(
        self, wav: npt.NDArray[np.float32]
    ) -> tuple[npt.NDArray[np.float32], dict[str, Any]]:
        """Run YAAPT on ``wav`` and return the subsampled pitch track.

        Args:
            wav: Waveform handed to ``SignalObj`` along with
                ``self.sample_rate``.

        Returns:
            A pair ``(f0, stats)``. ``f0`` is a copy of every
            ``hop_length``-th element of ``pitch.values``. ``stats`` maps
            attribute names to values for whichever of the listed
            ``PitchObj`` attributes exist on the returned object.
        """
        signal = amfm_decompy.basic_tools.SignalObj(wav, self.sample_rate)

        # These are *class* attributes of PitchObj, so the assignment is
        # visible to every other user of PitchObj in this process and stays
        # in effect after this call returns.
        pitch_cls = amfm_decompy.pYAAPT.PitchObj
        pitch_cls.PITCH_HALF = self.pitch_half
        pitch_cls.PITCH_HALF_SENS = self.pitch_half_sens
        pitch_cls.PITCH_DOUBLE = self.pitch_double
        pitch_cls.PITCH_DOUBLE_SENS = self.pitch_double_sens
        pitch_cls.SMOOTH_FACTOR = self.smooth_factor
        pitch_cls.SMOOTH = self.smooth
        pitch_cls.PTCH_TYP = self.ptch_typ

        pitch = amfm_decompy.pYAAPT.yaapt(
            signal,
            frame_length=self.frame_length,
            tda_frame_length=self.tda_frame_length,
            frame_space=self.frame_space,
            f0_min=self.fmin,
            f0_max=self.fmax,
            fft_length=self.fft_length,
            bp_forder=self.bp_forder,
            bp_low=self.bp_low,
            bp_high=self.bp_high,
            nlfer_thresh1=self.nlfer_thresh1,
            nlfer_thresh2=self.nlfer_thresh2,
            shc_numharms=self.shc_numharms,
            shc_window=self.shc_window,
            shc_maxpeaks=self.shc_maxpeaks,
            shc_pwidth=self.shc_pwidth,
            shc_thresh1=self.shc_thresh1,
            shc_thresh2=self.shc_thresh2,
            f0_double=self.f0_double,
            f0_half=self.f0_half,
            dp5_k1=self.dp5_k1,
            dec_factor=self.dec_factor,
            nccf_thresh1=self.nccf_thresh1,
            nccf_thresh2=self.nccf_thresh2,
            nccf_maxcands=self.nccf_maxcands,
            nccf_pwidth=self.nccf_pwidth,
            merit_boost=self.merit_boost,
            merit_pivot=self.merit_pivot,
            merit_extra=self.merit_extra,
            median_value=self.median_value,
            dp_w1=self.dp_w1,
            dp_w2=self.dp_w2,
            dp_w3=self.dp_w3,
            dp_w4=self.dp_w4,
            spec_pitch_min_std=self.spec_pitch_min_std,
        )

        # Attributes copied into the stats dict when present on the result;
        # missing ones are skipped rather than raising.
        stat_names = (
            "nfft",
            "frame_size",
            "frame_jump",
            "noverlap",
            "mean_energy",
            "energy",
            "vuv",
            "frame_pos",
            "nframes",
            "samp_values",
            "values",
            "edges",
            "samp_interp",
            "values_interp",
        )
        stats: dict[str, Any] = {
            name: getattr(pitch, name)
            for name in stat_names
            if hasattr(pitch, name)
        }

        # NOTE(review): slicing by hop_length assumes pitch.values holds one
        # entry per input sample -- confirm against the amfm_decompy docs.
        # .copy() detaches the result from the array kept in stats["values"].
        return pitch.values[:: self.hop_length].copy(), stats