F0推定の手法を色々試してみる

Slide 1

Slide 1 text

Slide 14

Slide 14 text

# © DeNA Co., Ltd. 14
class CREPE(PitchEstimator):
    """Pitch estimator that forwards to ``torchcrepe.predict``.

    All constructor arguments are stored unchanged on the instance and
    passed as keyword arguments to ``torchcrepe.predict`` by ``estimate``.
    """

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        fmin: float = 50.0,
        fmax: float = torchcrepe.MAX_FMAX,
        model: Literal["full", "tiny"] = "full",
        decoder: Callable = torchcrepe.decode.viterbi,
        return_periodicity: bool = False,
        batch_size: Optional[int] = None,
        device="cpu",
        pad: bool = True,
    ):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.model = model
        self.decoder = decoder
        self.return_periodicity = return_periodicity
        self.batch_size = batch_size
        self.device = device
        self.pad = pad

    def estimate(
        self, wav: npt.NDArray[np.float32]
    ) -> tuple[npt.NDArray[np.float32], dict[str, Any]]:
        """Estimate pitch for ``wav``.

        Returns:
            A ``(pitch, extras)`` tuple. ``extras`` holds ``"periodicity"``
            when ``return_periodicity`` is set, otherwise it is empty.
        """
        if isinstance(wav, np.ndarray):
            wav = torch.from_numpy(wav).float()

        # ``wav[None]`` prepends a batch axis for the predictor.
        results = torchcrepe.predict(
            wav[None],
            sample_rate=self.sample_rate,
            hop_length=self.hop_length,
            fmin=self.fmin,
            fmax=self.fmax,
            model=self.model,
            decoder=self.decoder,
            return_periodicity=self.return_periodicity,
            batch_size=self.batch_size,
            device=self.device,
            pad=self.pad,
        )

        if self.return_periodicity:
            pitch, periodicity = results
            pitch = pitch.squeeze_(0).numpy()
            # Drop the batch axis here as well so that periodicity has the
            # same shape as pitch (previously it kept the leading axis).
            periodicity = periodicity.squeeze(0).numpy()
            return pitch, {"periodicity": periodicity}

        pitch = results.squeeze_(0).numpy()
        return pitch, {}


class PYIN(PitchEstimator):
    """Pitch estimator that forwards to ``librosa.pyin``.

    ``frame_length`` falls back to ``sample_rate // 20`` when it is ``None``;
    every other argument is stored unchanged and passed through to
    ``librosa.pyin`` by ``estimate``.
    """

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        fmin: float = librosa.note_to_hz("C2"),
        fmax: float = librosa.note_to_hz("C7"),
        frame_length: Optional[int] = None,
        win_length: Optional[int] = None,
        n_thresholds: int = 100,
        beta_parameters: tuple[float, float] = (2, 18),
        boltzmann_parameter: float = 2,
        resolution: float = 0.1,
        max_transition_rate: float = 35.92,
        switch_prob: float = 0.01,
        no_trough_prob: float = 0.01,
        fill_na: Optional[float] = np.nan,
        center: bool = True,
        pad_mode: Union[
            Literal[
                "constant",
                "edge",
                "linear_ramp",
                "maximum",
                "mean",
                "median",
                "minimum",
                "reflect",
                "symmetric",
                "wrap",
                "empty",
            ],
            Callable,
        ] = "constant",
    ):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.frame_length = sample_rate // 20 if frame_length is None else frame_length
        self.win_length = win_length
        self.n_thresholds = n_thresholds
        self.beta_parameters = beta_parameters
        self.boltzmann_parameter = boltzmann_parameter
        self.resolution = resolution
        self.max_transition_rate = max_transition_rate
        self.switch_prob = switch_prob
        self.no_trough_prob = no_trough_prob
        self.fill_na = fill_na
        self.center = center
        self.pad_mode = pad_mode

    def estimate(
        self, wav: npt.NDArray[np.float32]
    ) -> tuple[npt.NDArray[np.float32], dict[str, Any]]:
        """Estimate pitch for ``wav``.

        Returns:
            A ``(pitch, extras)`` tuple. ``pitch`` is cast to float32 and
            passed through ``np.nan_to_num``, so NaN entries become 0.
            ``extras`` holds ``"voiced_flag"`` and ``"voiced_prob"`` as
            returned by ``librosa.pyin``.
        """
        pitch, voiced_flag, voiced_prob = librosa.pyin(
            wav,
            fmin=self.fmin,
            fmax=self.fmax,
            sr=self.sample_rate,
            frame_length=self.frame_length,
            win_length=self.win_length,
            hop_length=self.hop_length,
            n_thresholds=self.n_thresholds,
            beta_parameters=self.beta_parameters,
            boltzmann_parameter=self.boltzmann_parameter,
            resolution=self.resolution,
            max_transition_rate=self.max_transition_rate,
            switch_prob=self.switch_prob,
            no_trough_prob=self.no_trough_prob,
            fill_na=self.fill_na,
            center=self.center,
            pad_mode=self.pad_mode,
        )
        return np.nan_to_num(pitch.astype(np.float32)), {
            "voiced_flag": voiced_flag,
            "voiced_prob": voiced_prob,
        }

Slide 15

Slide 15 text

# © DeNA Co., Ltd. 15
class DIO(PitchEstimator):
    """Pitch estimator built on ``pyworld.dio``, optionally refined by
    ``pyworld.stonemask``."""

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        fmin: float = 71.0,
        fmax: float = 800.0,
        with_stonemask: bool = True,
    ):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.with_stonemask = with_stonemask

    def estimate(
        self, wav: npt.NDArray[np.float32]
    ) -> tuple[npt.NDArray[np.float32], dict[str, Any]]:
        """Return ``(pitch, {"t": t})`` with ``pitch`` cast to float32."""
        signal = wav.astype(np.float64)
        # Hop size expressed as 1000 * hop_length / sample_rate.
        frame_period = 1000.0 * self.hop_length / self.sample_rate
        f0, times = pyworld.dio(
            signal,
            fs=self.sample_rate,
            f0_ceil=self.fmax,
            f0_floor=self.fmin,
            frame_period=frame_period,
        )
        if self.with_stonemask:
            f0 = pyworld.stonemask(signal, f0, times, self.sample_rate)
        return f0.astype(np.float32), {"t": times}


class Harvest(PitchEstimator):
    """Pitch estimator built on ``pyworld.harvest``."""

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        fmin: float = 71.0,
        fmax: float = 800.0,
    ):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax

    def estimate(
        self, wav: npt.NDArray[np.float32]
    ) -> tuple[npt.NDArray[np.float32], dict[str, Any]]:
        """Return ``(pitch, {"t": t})`` with ``pitch`` cast to float32."""
        signal = wav.astype(np.double)
        # Hop size expressed as 1000 * hop_length / sample_rate.
        frame_period = 1000.0 * self.hop_length / self.sample_rate
        f0, times = pyworld.harvest(
            signal,
            fs=self.sample_rate,
            f0_ceil=self.fmax,
            f0_floor=self.fmin,
            frame_period=frame_period,
        )
        return f0.astype(np.float32), {"t": times}


class RAPT(PitchEstimator):
    """Pitch estimator built on ``pysptk.rapt``."""

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        fmin: float = 60.0,
        fmax: float = 240.0,
        voice_bias: float = 0.0,
    ):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.voice_bias = voice_bias

    def estimate(
        self, wav: npt.NDArray[np.float32]
    ) -> tuple[npt.NDArray[np.float32], dict[str, Any]]:
        """Return ``(pitch, {})`` with ``pitch`` cast to float32."""
        # NOTE(review): the 32767.0 factor presumably maps [-1, 1] floats onto
        # the 16-bit amplitude range the library expects -- confirm.
        scaled = (wav * 32767.0).astype(np.float32)
        f0 = pysptk.rapt(
            scaled,
            fs=self.sample_rate,
            hopsize=self.hop_length,
            min=self.fmin,
            max=self.fmax,
            voice_bias=self.voice_bias,
        )
        return f0.astype(np.float32), {}


class SWIPE(PitchEstimator):
    """Pitch estimator built on ``pysptk.swipe``."""

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        fmin: float = 60.0,
        fmax: float = 240.0,
        threshold: float = 0.3,
    ):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.threshold = threshold

    def estimate(
        self, wav: npt.NDArray[np.float32]
    ) -> tuple[npt.NDArray[np.float32], dict[str, Any]]:
        """Return ``(pitch, {})`` with ``pitch`` cast to float32."""
        # Same 32767.0 scaling as RAPT, but this call is fed float64 samples.
        scaled = (wav * 32767.0).astype(np.float64)
        f0 = pysptk.swipe(
            scaled,
            fs=self.sample_rate,
            hopsize=self.hop_length,
            min=self.fmin,
            max=self.fmax,
            threshold=self.threshold,
        )
        return f0.astype(np.float32), {}

Slide 16

Slide 16 text

# © DeNA Co., Ltd. 16
class YAAPT(PitchEstimator):
    """Pitch estimator built on ``amfm_decompy.pYAAPT.yaapt``.

    Constructor arguments are stored unchanged on the instance. In
    ``estimate`` most of them are passed as keyword arguments to ``yaapt``;
    the ``pitch_half*``, ``pitch_double*``, ``smooth*`` and ``ptch_typ``
    values are instead written to class attributes of
    ``amfm_decompy.pYAAPT.PitchObj`` before the call.
    """

    # Attributes copied from the returned pitch object into the extras dict
    # when present.
    _STAT_NAMES = (
        "nfft",
        "frame_size",
        "frame_jump",
        "noverlap",
        "mean_energy",
        "energy",
        "vuv",
        "frame_pos",
        "nframes",
        "samp_values",
        "values",
        "edges",
        "samp_interp",
        "values_interp",
    )

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        fmin: float = 60.0,
        fmax: float = 400.0,
        frame_length: float = 35.0,
        tda_frame_length: float = 35.0,
        frame_space: float = 10.0,
        fft_length: int = 8192,
        bp_forder: int = 150,
        bp_low: float = 50.0,
        bp_high: float = 1500.0,
        nlfer_thresh1: float = 0.75,
        nlfer_thresh2: float = 0.1,
        shc_numharms: int = 3,
        shc_window: float = 40.0,
        shc_maxpeaks: int = 4,
        shc_pwidth: float = 50.0,
        shc_thresh1: float = 5.0,
        shc_thresh2: float = 1.25,
        f0_double: float = 150.0,
        f0_half: float = 150.0,
        dp5_k1: float = 11.0,
        dec_factor: int = 1,
        nccf_thresh1: float = 0.3,
        nccf_thresh2: float = 0.9,
        nccf_maxcands: int = 3,
        nccf_pwidth: int = 5,
        merit_boost: float = 0.20,
        merit_pivot: float = 0.99,
        merit_extra: float = 0.4,
        median_value: int = 7,
        dp_w1: float = 0.15,
        dp_w2: float = 0.5,
        dp_w3: float = 0.1,
        dp_w4: float = 0.9,
        spec_pitch_min_std: float = 0.05,
        pitch_half: int = 0,
        pitch_half_sens: float = 2.9,
        pitch_double: int = 0,
        pitch_double_sens: float = 2.9,
        smooth_factor: int = 5,
        smooth: int = 5,
        ptch_typ: float = 100.0,
    ):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.frame_length = frame_length
        self.tda_frame_length = tda_frame_length
        self.frame_space = frame_space
        self.fft_length = fft_length
        self.bp_forder = bp_forder
        self.bp_low = bp_low
        self.bp_high = bp_high
        self.nlfer_thresh1 = nlfer_thresh1
        self.nlfer_thresh2 = nlfer_thresh2
        self.shc_numharms = shc_numharms
        self.shc_window = shc_window
        self.shc_maxpeaks = shc_maxpeaks
        self.shc_pwidth = shc_pwidth
        self.shc_thresh1 = shc_thresh1
        self.shc_thresh2 = shc_thresh2
        self.f0_double = f0_double
        self.f0_half = f0_half
        self.dp5_k1 = dp5_k1
        self.dec_factor = dec_factor
        self.nccf_thresh1 = nccf_thresh1
        self.nccf_thresh2 = nccf_thresh2
        self.nccf_maxcands = nccf_maxcands
        self.nccf_pwidth = nccf_pwidth
        self.merit_boost = merit_boost
        self.merit_pivot = merit_pivot
        self.merit_extra = merit_extra
        self.median_value = median_value
        self.dp_w1 = dp_w1
        self.dp_w2 = dp_w2
        self.dp_w3 = dp_w3
        self.dp_w4 = dp_w4
        self.spec_pitch_min_std = spec_pitch_min_std
        self.pitch_half = pitch_half
        self.pitch_half_sens = pitch_half_sens
        self.pitch_double = pitch_double
        self.pitch_double_sens = pitch_double_sens
        self.smooth_factor = smooth_factor
        self.smooth = smooth
        self.ptch_typ = ptch_typ

    def estimate(
        self, wav: npt.NDArray[np.float32]
    ) -> tuple[npt.NDArray[np.float32], dict[str, Any]]:
        """Estimate pitch for ``wav``.

        Returns:
            A ``(pitch, stats)`` tuple. ``pitch`` is every ``hop_length``-th
            entry of the result's ``values`` array, cast to float32.
            ``stats`` maps each name in ``_STAT_NAMES`` that exists on the
            result object to its value.
        """
        signal = amfm_decompy.basic_tools.SignalObj(wav, self.sample_rate)

        # These are assigned on the PitchObj *class*, so they affect every
        # PitchObj in the process, not just the one created by this call.
        pitch_cls = amfm_decompy.pYAAPT.PitchObj
        pitch_cls.PITCH_HALF = self.pitch_half
        pitch_cls.PITCH_HALF_SENS = self.pitch_half_sens
        pitch_cls.PITCH_DOUBLE = self.pitch_double
        pitch_cls.PITCH_DOUBLE_SENS = self.pitch_double_sens
        pitch_cls.SMOOTH_FACTOR = self.smooth_factor
        pitch_cls.SMOOTH = self.smooth
        pitch_cls.PTCH_TYP = self.ptch_typ

        pitch = amfm_decompy.pYAAPT.yaapt(
            signal,
            frame_length=self.frame_length,
            tda_frame_length=self.tda_frame_length,
            frame_space=self.frame_space,
            f0_min=self.fmin,
            f0_max=self.fmax,
            fft_length=self.fft_length,
            bp_forder=self.bp_forder,
            bp_low=self.bp_low,
            bp_high=self.bp_high,
            nlfer_thresh1=self.nlfer_thresh1,
            nlfer_thresh2=self.nlfer_thresh2,
            shc_numharms=self.shc_numharms,
            shc_window=self.shc_window,
            shc_maxpeaks=self.shc_maxpeaks,
            shc_pwidth=self.shc_pwidth,
            shc_thresh1=self.shc_thresh1,
            shc_thresh2=self.shc_thresh2,
            f0_double=self.f0_double,
            f0_half=self.f0_half,
            dp5_k1=self.dp5_k1,
            dec_factor=self.dec_factor,
            nccf_thresh1=self.nccf_thresh1,
            nccf_thresh2=self.nccf_thresh2,
            nccf_maxcands=self.nccf_maxcands,
            nccf_pwidth=self.nccf_pwidth,
            merit_boost=self.merit_boost,
            merit_pivot=self.merit_pivot,
            merit_extra=self.merit_extra,
            median_value=self.median_value,
            dp_w1=self.dp_w1,
            dp_w2=self.dp_w2,
            dp_w3=self.dp_w3,
            dp_w4=self.dp_w4,
            spec_pitch_min_std=self.spec_pitch_min_std,
        )

        stats = {
            name: getattr(pitch, name)
            for name in self._STAT_NAMES
            if hasattr(pitch, name)
        }

        # NOTE(review): presumably ``pitch.values`` is the track upsampled to
        # the signal length, so striding by hop_length gives one value per
        # hop -- confirm against the AMFM_decompy documentation.
        # ``astype`` returns a new array, so the result is still detached
        # from ``stats["values"]`` and now matches the declared float32 type.
        return pitch.values[:: self.hop_length].astype(np.float32), stats

Slide 17

Slide 17 text

# © DeNA Co., Ltd. 17
class REAPER(PitchEstimator):
    """Pitch estimator built on ``pyreaper.reaper``."""

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        fmin: float = 40.0,
        fmax: float = 500.0,
        do_high_pass: bool = True,
        do_hilbert_transform: bool = False,
        inter_pulse: float = 0.01,
        unvoiced_cost: float = 0.1,
    ):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.do_high_pass = do_high_pass
        self.do_hilbert_transform = do_hilbert_transform
        self.inter_pulse = inter_pulse
        self.unvoiced_cost = unvoiced_cost

    def estimate(
        self, wav: npt.NDArray[np.float32]
    ) -> tuple[npt.NDArray[np.float32], dict[str, Any]]:
        """Estimate pitch for ``wav``.

        Returns:
            A ``(f0, extras)`` tuple. ``f0`` is clipped to be non-negative
            and cast to float32. ``extras`` holds ``"pm_times"``, ``"pm"``,
            ``"f0_times"`` and ``"corr"`` as returned by ``pyreaper.reaper``.
        """
        # The input is scaled by 32767.0, rounded and handed over as int16.
        pcm = np.round(wav * 32767.0).astype(np.int16)
        pm_times, pm, f0_times, f0, corr = pyreaper.reaper(
            pcm,
            fs=self.sample_rate,
            minf0=self.fmin,
            maxf0=self.fmax,
            do_high_pass=self.do_high_pass,
            do_hilbert_transform=self.do_hilbert_transform,
            inter_pulse=self.inter_pulse,
            frame_period=self.hop_length / self.sample_rate,
            unvoiced_cost=self.unvoiced_cost,
        )
        # Negative entries are replaced by 0.
        # NOTE(review): presumably these mark unvoiced frames -- confirm.
        f0 = f0.clip(min=0.0)
        return f0.astype(np.float32), {
            "pm_times": pm_times,
            "pm": pm,
            "f0_times": f0_times,
            "corr": corr,
        }


class FCPE(PitchEstimator):
    """Pitch estimator built on the bundled ``torchfcpe`` inference model.

    The model is created once in ``__init__`` via
    ``torchfcpe.spawn_bundled_infer_model()`` and reused by ``estimate``.
    """

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        fmin: float = 80.0,
        fmax: float = 880.0,
        decoder_mode: Literal["argmax", "local_argmax"] = "local_argmax",
        threshold: float = 0.006,
        interp_uv: bool = False,
    ):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.decoder_mode = decoder_mode
        self.threshold = threshold
        self.interp_uv = interp_uv
        self.model = torchfcpe.spawn_bundled_infer_model()

    def estimate(
        self, wav: npt.NDArray[np.float32]
    ) -> tuple[npt.NDArray[np.float32], dict[str, Any]]:
        """Estimate pitch for ``wav``.

        Returns:
            A ``(f0, {"uv": uv})`` tuple, both converted to NumPy arrays
            with their first and last axes squeezed away.
        """
        # Requested output length: one value per hop, plus one.
        output_interp_target_length = (len(wav) // self.hop_length) + 1

        # Add a leading and a trailing singleton axis around the samples.
        audio = torch.from_numpy(wav).float()[None, ..., None]

        f0, uv = self.model.infer(
            audio,
            sr=self.sample_rate,
            # Use the configured decoder; this was previously hard-coded to
            # "local_argmax", so the constructor argument had no effect.
            decoder_mode=self.decoder_mode,
            threshold=self.threshold,
            f0_min=self.fmin,
            f0_max=self.fmax,
            interp_uv=self.interp_uv,
            output_interp_target_length=output_interp_target_length,
            # NOTE(review): ``retur_uv`` is kept exactly as written; it is
            # believed to be the keyword's actual spelling in torchfcpe --
            # confirm before renaming.
            retur_uv=True,
        )
        f0 = f0.squeeze(0).squeeze(-1).numpy()
        uv = uv.squeeze(0).squeeze(-1).numpy()
        return f0, {"uv": uv}

Slide 1

Slide 1 text

Slide 2

Slide 2 text

Slide 3

Slide 3 text

Slide 4

Slide 4 text

Slide 5

Slide 5 text

Slide 6

Slide 6 text

Slide 7

Slide 7 text

Slide 8

Slide 8 text

Slide 9

Slide 9 text

Slide 10

Slide 10 text

Slide 11

Slide 11 text

Slide 12

Slide 12 text

Slide 13

Slide 13 text