When working with large collections of audio files, one interesting dimension of the dataset is the length of each file in seconds. Here we explore ways of calculating audio lengths using Python. We start off by creating an audio file.
import wave
import numpy as np
# Synthesise a 440 Hz sine tone and store it as a mono, 16-bit wave file.
samplerate = 44100
duration = 1.6  # length of the tone in seconds
# The time axis has to span the whole duration (not 0..1), otherwise the
# samples are stretched and the tone is no longer 440 Hz. endpoint=False
# keeps the sample spacing at exactly 1/samplerate.
t = np.linspace(0, duration, int(duration * samplerate), endpoint=False)
audio = 0.5 * np.sin(2 * np.pi * 440.0 * t)
# Convert to (little-endian) 16 bit integers.
audio = (audio * (2 ** 15 - 1)).astype("<h")
fn = 'sound1.wav'
with wave.open(fn, "w") as f:
    f.setnchannels(1)  # mono
    f.setsampwidth(2)  # 2 bytes = 16 bits per sample
    f.setframerate(samplerate)
    f.writeframes(audio.tobytes())
The simplest way of analysing an audio file is to use an existing program, like soxi, as a subprocess.
import subprocess
# Run `soxi -D` on the file and show its captured standard output, which is
# the duration in seconds as raw bytes.
subprocess.run(["soxi", "-D", fn], stdout=subprocess.PIPE).stdout
b'1.600000\n'
If we are working with a wave file, we can decode its header and use the information stored there. Other audio formats, like flac, also provide the necessary values in their headers.
The audio file is encoded into a wave container. The wave container has a 44 byte header:
|bytes|value |description |
|1-4 |"RIFF" |Marks the file as a riff file |
|5-8 |file_size |overall size of the file - 8 bytes |
|9-12 |"WAVE" |Marks the file as WAVE |
|13-16|"fmt " |Format chunk marker |
|17-20|fmt_length |Length of format data |
|21-22|type |Type of format (1=PCM) |
|23-24|channels |Number of channels |
|25-28|sample_rate |Number of samples per second |
|29-32|bytes_per_second |Number of bytes per second |
|33-34|bytes_per_sample |Number of bytes per sample |
|35-36|bits_per_sample |Sample bit size |
|37-40|"data" |Start of data section |
|41-44|data_length |Size of data section |
import struct
# Names of the 13 header fields, in the order in which they are stored.
fieldnames = "riff,file_size,wave,fmt,fmt_length,type,channels,sample_rate,bytes_per_second,bytes_per_sample,bits_per_sample,data,data_length".split(",")
# Read only the 44 header bytes. The context manager closes the file handle
# right away instead of leaving it open until garbage collection.
with open(fn, 'rb') as wav_file:
    fields = struct.unpack('<LLLLLHHLLHHLL', wav_file.read(44))
# Pair every unpacked value with its field name.
header = dict(zip(fieldnames, fields))
header
{'riff': 1179011410, 'file_size': 141156, 'wave': 1163280727, 'fmt': 544501094, 'fmt_length': 16, 'type': 1, 'channels': 1, 'sample_rate': 44100, 'bytes_per_second': 88200, 'bytes_per_sample': 2, 'bits_per_sample': 16, 'data': 1635017060, 'data_length': 141120}
# Duration in seconds: size of the data section divided by the number of
# bytes that make up one second of audio.
header['data_length']/header["bytes_per_second"]
1.6
A third attractive method is to use the libsox library from Python. This provides a nice middle ground between implementation details, code maintenance and speed. In order to use libsox, we need to describe the data structures used in the library to Python. We do this with ctypes.
from ctypes import *
from ctypes.util import find_library
import subprocess
# find_library() returns None when no matching shared library is installed.
# Fail here with a clear message instead of handing None to LoadLibrary.
_libsox_path = find_library("sox")
if _libsox_path is None:
    raise OSError("libsox shared library not found; is SoX installed?")
libsox = cdll.LoadLibrary(_libsox_path)
class SignalInfo(Structure):
    """
    sox_signalinfo_t as per source (1)

    The field types must mirror the C declaration exactly, because ctypes
    derives every field offset from them. ``channels`` and ``precision`` are
    declared as ``unsigned`` in sox.h and are therefore mapped to c_uint;
    a one-byte type would make ``precision`` read from the wrong offset.

    (1) https://sourceforge.net/p/sox/code/ci/master/tree/src/sox.h#l1349
    """
    _fields_ = [
        ("rate", c_double),
        ("channels", c_uint),
        ("precision", c_uint),
        ("length", c_ulonglong),
        # Declared as `double *` in sox.h; kept as c_void_p so that a NULL
        # pointer is exposed as None.
        ("mult", c_void_p),
    ]
class EncodingInfo(Structure):
    """
    sox_encodinginfo_t as per source (1)

    The C struct has a ``double compression`` member between
    ``bits_per_sample`` and ``reverse_bytes``. It has to be declared here as
    well: leaving it out shifts every following field, so the reverse_*
    members would read the bytes of the compression value instead.

    (1) https://sourceforge.net/p/sox/code/ci/master/tree/src/sox.h#l1371
    """
    _fields_ = [
        ("encoding", c_int),
        ("bits_per_sample", c_uint),
        ("compression", c_double),
        ("reverse_bytes", c_int),
        ("reverse_nibbles", c_int),
        ("reverse_bits", c_int),
        # NOTE(review): sox_bool appears to be an enum in sox.h, so c_int would
        # mirror it exactly. c_bool is kept so the Python value stays a bool;
        # it only reads the first byte - TODO confirm on big-endian hosts.
        ("opposite_endian", c_bool),
    ]
class AudioInfo(Structure):
    """
    Start of sox_format_t as per source (1)

    Only the first three members are declared here: the file name, the
    signal description and the encoding description. The order of the
    entries in ``_fields_`` defines the memory layout and must not change.

    (1) https://sourceforge.net/p/sox/code/ci/master/tree/src/sox.h#l1500
    """
    _fields_ = [
        # File name as a C string (exposed to Python as bytes).
        ("filename", c_char_p),
        # Embedded by value, not as pointers.
        ("signal", SignalInfo),
        ("encoding", EncodingInfo),
    ]
# Bind the libsox entry points used below.
soxopen = libsox.sox_open_read
# Declare the return type so that ctypes hands back a POINTER(AudioInfo)
# instead of its default integer return value.
soxopen.restype = POINTER(AudioInfo)
# Called with the pointer returned by soxopen once the data has been read.
soxclose = libsox.sox_close
def dictify(it):
    """Recursively turn a structure into plain Python data.

    bytes are decoded to str; int, str, bool, float and None are returned
    unchanged; any other object becomes a dict that maps each attribute name
    not starting with an underscore to its dictified value.
    """
    if isinstance(it, bytes):
        return it.decode()
    if it is None or isinstance(it, (int, str, bool, float)):
        return it
    result = {}
    for attr in dir(it):
        # Skip private and dunder attributes.
        if attr[0] == '_':
            continue
        result[attr] = dictify(getattr(it, attr))
    return result
def get_audioinfo(fn):
    """Open *fn* with libsox and return its metadata as a nested dict.

    Parameters
    ----------
    fn : str
        Path of the audio file.

    Returns
    -------
    dict or None
        The dictified AudioInfo structure, or None when libsox returns a
        NULL pointer for the file.
    """
    handle = soxopen(fn.encode(), None, None, None)
    if not handle:
        # A NULL pointer is falsy: the file could not be opened.
        return None
    try:
        # Copy everything into Python objects while the handle is still open.
        return dictify(handle.contents)
    finally:
        # Always release the handle, even if the conversion above fails.
        soxclose(handle)
# Read the metadata of the example file through libsox and display it.
meta = get_audioinfo(fn)
meta
{'encoding': {'bits_per_sample': 16, 'encoding': 1, 'opposite_endian': False, 'reverse_bits': 0, 'reverse_bytes': 0, 'reverse_nibbles': 2146435072}, 'filename': 'sound1.wav', 'signal': {'channels': 1, 'length': 70560, 'mult': None, 'precision': 0, 'rate': 44100.0}}
# Sample count reported by libsox; it is divided by the channel count and
# the sample rate below to obtain seconds.
meta["signal"]["length"]
70560
# Duration in seconds: length / channels / rate. max(..., 1) avoids a
# division by zero when channels or rate is 0.
meta["signal"]["length"]/max(meta["signal"]["channels"], 1)/max(meta["signal"]["rate"], 1)
1.6
def get_duration_sox(fn):
    """Return the duration of *fn* in seconds as printed by ``soxi -D``."""
    command = ["soxi", "-D", fn]
    completed = subprocess.run(command, stdout=subprocess.PIPE)
    # stdout holds the printed number as bytes, which float() parses directly.
    return float(completed.stdout)
def get_duration_libsox(fn):
    """
    Get audio duration as per source (1)

    Returns the duration of *fn* in seconds, or None when get_audioinfo
    returns nothing for the file.

    (1) https://sourceforge.net/p/sox/code/ci/master/tree/src/sox.c#l2679
    """
    ainfo = get_audioinfo(fn)
    if not ainfo:
        return None
    # Use the metadata of *this* file, not a module-level variable left over
    # from an earlier cell.
    signal = ainfo["signal"]
    # max(..., 1) avoids a division by zero when channels or rate is 0.
    return signal["length"] / max(signal["channels"], 1) / max(signal["rate"], 1)
def get_duration_struct(fn, check=False):
    """Return the duration of the wave file *fn* in seconds.

    The duration is computed from the 44 byte wave header alone, as
    ``data_length / bytes_per_second``.

    Parameters
    ----------
    fn : str
        Path of the wave file to inspect.
    check : bool, optional
        When true, verify the "RIFF", "WAVE", "fmt " and "data" markers
        before trusting the header.

    Raises
    ------
    ValueError
        If *check* is true and one of the markers does not match.
    struct.error
        If the file is shorter than the 44 byte header.
    """
    # Little-endian layout of the header; the four markers are read as raw
    # 4-byte strings so that they can be compared when check is requested.
    header_format = '<4sL4s4sLHHLLHH4sL'
    fieldnames = "riff,file_size,wave,fmt,fmt_length,type,channels,sample_rate,bytes_per_second,bytes_per_sample,bits_per_sample,data,data_length".split(",")
    # Read the file that was asked for, and close it again right away.
    with open(fn, 'rb') as wav_file:
        raw = wav_file.read(struct.calcsize(header_format))
    header = dict(zip(fieldnames, struct.unpack(header_format, raw)))
    if check:
        markers = {"riff": b"RIFF", "wave": b"WAVE", "fmt": b"fmt ", "data": b"data"}
        for key, expected in markers.items():
            if header[key] != expected:
                raise ValueError(
                    f"{fn!r}: unexpected {key!r} marker {header[key]!r}, expected {expected!r}"
                )
    return header['data_length'] / header["bytes_per_second"]
%timeit get_duration_sox(fn)
3.08 ms ± 162 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit get_duration_libsox(fn)
41.2 µs ± 2.81 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit get_duration_struct(fn)
12.6 µs ± 168 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
# This jupyter notebook cell is used to convert a notebook into publishable nicely styled html
# display and HTML are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# CSS class names for which the typography rules below are generated.
cell_type_names = ["jp-MarkdownOutput", "text_cell_render"]
# Rules are written once for "text_cell_render" and copied for every class
# name further down.
output_styles_template = """
div.text_cell_render > p {
font-size: 21px;
line-height: 32px;
color: #444;
margin: 1em;
}
div.text_cell_render > p code {
font-size: 18px;
background-color: #ddd;
border: 1px solid #ccc;
color: #444;
}
div.text_cell_render > ul li {
font-size: 21px;
line-height: 16px;
color: #444;
margin: 1em;
}
div.text_cell_render > h2 {
font-size: 28px;
font-family: sans-serif;
color: #444;
margin: 1em;
}
div.text_cell_render > h3 {
font-size: 23px;
font-family: sans-serif;
color: #444;
margin: 1em;
}
div.text_cell_render > h1 {
font-weight: 400;
font-size: 52px;
margin: 2em;
letter-spacing: 0.04em;
color: #444;
}
"""
# Mark every declaration as !important so it wins over the notebook defaults.
output_styles_template = output_styles_template.replace(";", " !important;")
# One copy of the rules per class name, joined into a single stylesheet.
output_styles = "\n".join(
    output_styles_template.replace("text_cell_render", cell_type)
    for cell_type in cell_type_names
)
# Page-level stylesheet for the exported notebook. As with the typography
# rules, every declaration is turned into an !important one by the
# replace() call at the end.
styles = """
/* Hide the header and the footer (this node) */
/* #notebook-container > div:first-child { display: none; } */
#notebook-container > div:last-child { display: none; }
.container { width: 100%; }
@media only screen and (min-width: 1600px) {
.container { width: 1500px; }
}
/* .container { width: 100%; } */
.prompt { display: none; }
.output_stderr { display: none; }
.output_subarea {max-width: 100%; }
div.input { margin: 0 5em; }
div.output { margin: 0 5em; }
body { font-family: serif; }
thead {
border-bottom: 1px solid #444;
}
th, tr, td {
border: none;
}
table {
border: none;
color: #444;
}
""".replace(
";", " !important;"
)
# Inject both stylesheets into the page inside a single <style> element.
display(HTML(f"<style> {styles} {output_styles}</style>"))