import argparse

import librosa
import numpy as np
from PySide6.QtCore import QPointF, Qt
from PySide6.QtGui import QColor, QImage, QPainter, QPen
from scipy.interpolate import RegularGridInterpolator

import movis as mv
from movis.imgproc import qimage_to_numpy

# Drawing modes understood by FrequencyLayer.
_MODES = ('line', 'circle')


def get_audio_image(path: str) -> tuple[np.ndarray, float]:
    """Build a normalized spectrogram image from an audio file.

    The audio is loaded with ``librosa.load``, converted to an STFT magnitude
    spectrogram in decibels (relative to its maximum), min-max normalized to
    ``[0, 1]``, stripped of its quietest frequency rows (mean at or below the
    5th percentile of row means) and finally resampled to 256 rows.

    Args:
        path: Path of the audio file to analyze.

    Returns:
        A tuple ``(image, duration)`` where ``image`` has shape
        ``(256, n_frames)`` and ``duration`` is ``len(audio) / sampling_rate``.
    """
    audio, sampling_rate = librosa.load(path)
    duration = len(audio) / sampling_rate
    freq = np.abs(librosa.stft(audio, n_fft=2048, hop_length=512))
    db_array = librosa.amplitude_to_db(freq, ref=np.max)

    # Min-max normalization. A constant spectrogram (e.g. silence) has a zero
    # range; map it to zeros rather than dividing by zero and producing NaNs.
    m, M = db_array.min(), db_array.max()
    value_range = M - m
    if value_range > 0:
        db_array = (db_array - m) / value_range
    else:
        db_array = np.zeros_like(db_array)

    # Drop the quietest frequency rows. The comparison is strict, so when
    # many rows share the same mean the filter can remove (almost) everything;
    # the interpolation grid below needs at least two rows, so in that case
    # the unfiltered array is kept.
    row_mean = db_array.mean(axis=1)
    p = np.percentile(row_mean, 5)
    keep = row_mean > p
    if np.count_nonzero(keep) >= 2:
        db_array = db_array[keep, :]

    # Resample the frequency axis to a fixed 256 rows; the time axis keeps
    # its original sample positions.
    y_linear = np.linspace(0, 1, 256)
    y = np.linspace(0, 1, db_array.shape[0])
    x = np.linspace(0, 1, db_array.shape[1])
    interpolator = RegularGridInterpolator((y, x), db_array)
    # Query points with shape (256, n_frames, 2), each entry being (y, x).
    grid_y, grid_x = np.meshgrid(y_linear, x, indexing='ij')
    db_resampled = interpolator(np.stack([grid_y, grid_x], axis=-1))
    return db_resampled, duration


class FrequencyLayer:
    """Callable layer that draws one spectrogram column per frame.

    Calling the instance with a time returns an array of shape
    ``(size[1], size[0], 4)`` in which the spectrogram column for that time
    is drawn either as vertical bars (``'line'``) or as radial spokes
    (``'circle'``).
    """

    def __init__(self, audio_img: np.ndarray, duration: float,
                 size: tuple[int, int], mode: str = 'line'):
        """Store the spectrogram and drawing parameters.

        Args:
            audio_img: 2-D array indexed as ``[frequency, time]``.
            duration: Time span covered by the columns of ``audio_img``.
            size: ``(width, height)`` of the rendered image.
            mode: Either ``'line'`` or ``'circle'``.

        Raises:
            ValueError: If ``mode`` is not a supported drawing mode.
        """
        if mode not in _MODES:
            raise ValueError(
                f"unknown mode {mode!r}; expected one of {_MODES}")
        self.audio_img = audio_img
        self.duration = duration
        self.size = size
        self.mode = mode
        self.margin = 10
        self.length = 200

    def __call__(self, time: float) -> np.ndarray:
        """Render the frame for ``time``.

        Args:
            time: Time at which to sample the spectrogram.

        Returns:
            The rendered image converted with ``qimage_to_numpy``, or an
            all-zero ``uint8`` array of shape ``(size[1], size[0], 4)`` when
            ``time`` is outside ``[0, duration]`` or there is no data to draw.

        Raises:
            ValueError: If ``self.mode`` is not a supported drawing mode.
        """
        width, height = self.size[0], self.size[1]
        n_frames = self.audio_img.shape[1]
        if (time < 0 or self.duration < time
                or self.duration <= 0 or n_frames == 0):
            return np.zeros((height, width, 4), dtype=np.uint8)

        # time == duration would map to index n_frames (one past the end),
        # so clamp to the last column.
        i = min(int(time * n_frames / self.duration), n_frames - 1)
        array = self.audio_img[:, i]

        image = QImage(width, height, QImage.Format.Format_ARGB32)
        image.fill(QColor(0, 0, 0, 0))
        painter = QPainter(image)
        try:
            painter.setRenderHint(QPainter.RenderHint.Antialiasing, True)
            pen = QPen(QColor(255, 255, 255, 255))
            pen.setWidthF(5.0)
            pen.setCapStyle(Qt.PenCapStyle.RoundCap)
            painter.setPen(pen)

            if self.mode == 'line':
                self._draw_line(painter, array)
            elif self.mode == 'circle':
                self._draw_circle(painter, array)
            else:
                # Only reachable if ``mode`` was reassigned after __init__.
                raise ValueError(
                    f"unknown mode {self.mode!r}; expected one of {_MODES}")
        finally:
            # Always release the painter, even when drawing fails.
            painter.end()
        return qimage_to_numpy(image)

    def _draw_line(self, painter: QPainter, array: np.ndarray) -> None:
        """Draw one vertically centered bar per value, spread across the width."""
        width, height = self.size[0], self.size[1]
        points = np.linspace(
            self.margin, width - self.margin, len(array), dtype=np.float64)
        for px, v in zip(points, array):
            h = v * (height - self.margin * 2)
            painter.drawLine(
                QPointF(px, (height - h) / 2),
                QPointF(px, (height + h) / 2))

    def _draw_circle(self, painter: QPainter, array: np.ndarray) -> None:
        """Draw one outward spoke per value, evenly spaced around a circle."""
        n_point = len(array)
        theta = np.linspace(0., 2 * np.pi, n_point, endpoint=False)
        center = np.array([self.size[0] / 2, self.size[1] / 2], dtype=float)
        radius = min(self.size[0], self.size[1]) / 2 - self.length / 2 - self.margin
        # Unit direction vectors, shape (n_point, 2).
        points = np.concatenate(
            [np.cos(theta)[:, None], np.sin(theta)[:, None]], axis=1)
        points_start = center + radius * points
        points_end = center + (radius + array[:, None] * self.length / 2) * points
        for p0, p1 in zip(points_start, points_end):
            painter.drawLine(QPointF(p0[0], p0[1]), QPointF(p1[0], p1[1]))


def main():
    """Parse command-line options and render the visualizer video."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--type', choices=['line', 'circle'], default='line')
    parser.add_argument('-i', '--input', default='nyancat.mp3')
    parser.add_argument('-o', '--output', default='output.mp4')
    parser.add_argument('--background', default='bg.jpg')
    parser.add_argument('--no-logo', action='store_true')
    args = parser.parse_args()

    size = (1920, 1080)
    # The scene is made slightly longer than the audio itself.
    eps = 0.1
    audio_img, duration = get_audio_image(args.input)
    is_line = args.type == 'line'
    center = (size[0] // 2, size[1] // 2)

    scene = mv.layer.Composition(size, duration=duration + eps)
    scene.add_layer(mv.layer.Image(args.background, duration=duration + eps))

    if not args.no_logo:
        # In line mode the logo sits above the center and the bars below it;
        # in circle mode both are centered.
        logo_position = (center[0], center[1] - 200) if is_line else center
        scene.add_layer(
            mv.layer.Image('logo.png', duration=duration + eps),
            position=logo_position)

    freq_size = (1920, 256) if is_line else (1080, 1080)
    freq_position = (center[0], center[1] + 200) if is_line else center
    scene.add_layer(
        FrequencyLayer(audio_img, duration, freq_size, mode=args.type),
        position=freq_position, opacity=0.9)
    scene.add_layer(mv.layer.Audio(args.input))
    scene.write_video(args.output, audio=True)


if __name__ == '__main__':
    main()