Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MPEG: Find ID3v2 tags buried in junk #320

Merged
merged 4 commits into from
Jan 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- **ID3v1**: Renamed `GENRES[14]` to `"R&B"` (Previously `"Rhythm & Blues"`) ([PR](https://github.com/Serial-ATA/lofty-rs/pull/296))
- **MP4**: Duration milliseconds are now rounded to the nearest whole number ([PR](https://github.com/Serial-ATA/lofty-rs/pull/298))
- **ID3v2**: Stop erroring on empty frames when not using `ParsingMode::Strict` ([PR](https://github.com/Serial-ATA/lofty-rs/pull/299))
- **resolve**: Custom resolvers will now be checked before the default resolvers ([PR](https://github.com/Serial-ATA/lofty-rs/pull/319))
- **MPEG**: Up to `max_junk_bytes` will now be searched for tags between the start of the file and the first MPEG frame ([PR](https://github.com/Serial-ATA/lofty-rs/pull/320))
- This allows us to read and write ID3v2 tags that are preceeded by junk

### Fixed
- **MP4**:
Expand Down
1 change: 0 additions & 1 deletion src/ape/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ where
let mut ape_tag: Option<ApeTag> = None;

// ID3v2 tags are unsupported in APE files, but still possible
#[allow(unused_variables)]
if let ID3FindResults(Some(header), Some(content)) = find_id3v2(data, true)? {
log::warn!("Encountered an ID3v2 tag. This tag cannot be rewritten to the APE file!");

Expand Down
23 changes: 9 additions & 14 deletions src/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -893,7 +893,7 @@ impl FileType {
/// ```
pub fn from_buffer(buf: &[u8]) -> Option<Self> {
match Self::from_buffer_inner(buf) {
FileTypeGuessResult::Determined(file_ty) => Some(file_ty),
Some(FileTypeGuessResult::Determined(file_ty)) => Some(file_ty),
// We make no attempt to search past an ID3v2 tag or junk here, since
// we only provided a fixed-sized buffer to search from.
//
Expand All @@ -903,33 +903,30 @@ impl FileType {
}

// TODO: APE tags in the beginning of the file
pub(crate) fn from_buffer_inner(buf: &[u8]) -> FileTypeGuessResult {
pub(crate) fn from_buffer_inner(buf: &[u8]) -> Option<FileTypeGuessResult> {
use crate::id3::v2::util::synchsafe::SynchsafeInteger;

// Start out with an empty return
let mut ret = FileTypeGuessResult::Undetermined;
let mut ret = None;

if buf.is_empty() {
return ret;
}

match Self::quick_type_guess(buf) {
Some(f_ty) => ret = FileTypeGuessResult::Determined(f_ty),
Some(f_ty) => ret = Some(FileTypeGuessResult::Determined(f_ty)),
// Special case for ID3, gets checked in `Probe::guess_file_type`
// The bare minimum size for an ID3v2 header is 10 bytes
None if buf.len() >= 10 && &buf[..3] == b"ID3" => {
// This is infallible, but preferable to an unwrap
if let Ok(arr) = buf[6..10].try_into() {
// Set the ID3v2 size
ret =
FileTypeGuessResult::MaybePrecededById3(u32::from_be_bytes(arr).unsynch());
ret = Some(FileTypeGuessResult::MaybePrecededById3(
u32::from_be_bytes(arr).unsynch(),
));
}
},
None if buf.first().copied() == Some(0) => {
ret = FileTypeGuessResult::MaybePrecededByJunk
},
// We aren't able to determine a format
_ => {},
None => ret = Some(FileTypeGuessResult::MaybePrecededByJunk),
}

ret
Expand Down Expand Up @@ -1020,8 +1017,6 @@ pub(crate) enum FileTypeGuessResult {
Determined(FileType),
/// The stream starts with an ID3v2 tag
MaybePrecededById3(u32),
/// The stream starts with junk zero bytes
/// The stream starts with potential junk data
MaybePrecededByJunk,
/// The `FileType` could not be guessed
Undetermined,
}
2 changes: 2 additions & 0 deletions src/id3/v2/frame/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ where
let mut id_end = 4;
let mut invalid_v2_frame = false;
if header[3] == 0 && !synchsafe {
log::warn!("Found a v2 frame ID in a v3 tag, attempting to upgrade");

invalid_v2_frame = true;
id_end = 3;
}
Expand Down
6 changes: 6 additions & 0 deletions src/id3/v2/frame/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ impl<'a> ParsedFrame<'a> {

// Get the encryption method symbol
if let Some(enc) = flags.encryption.as_mut() {
log::trace!("Reading encryption method symbol");

if size < 1 {
return Err(Id3v2Error::new(Id3v2ErrorKind::BadFrameLength).into());
}
Expand All @@ -74,6 +76,8 @@ impl<'a> ParsedFrame<'a> {

// Get the group identifier
if let Some(group) = flags.grouping_identity.as_mut() {
log::trace!("Reading group identifier");

if size < 1 {
return Err(Id3v2Error::new(Id3v2ErrorKind::BadFrameLength).into());
}
Expand All @@ -84,6 +88,8 @@ impl<'a> ParsedFrame<'a> {

// Get the real data length
if flags.data_length_indicator.is_some() || flags.compression {
log::trace!("Reading data length indicator");

if size < 4 {
return Err(Id3v2Error::new(Id3v2ErrorKind::BadFrameLength).into());
}
Expand Down
2 changes: 1 addition & 1 deletion src/id3/v2/write/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ pub(crate) fn write_id3v2<'a, I: Iterator<Item = FrameRef<'a>> + Clone + 'a>(

// Unable to determine a format
if file_type.is_none() {
err!(UnsupportedTag);
err!(UnknownFormat);
}

let file_type = file_type.unwrap();
Expand Down
74 changes: 71 additions & 3 deletions src/mpeg/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::id3::v2::read::parse_id3v2;
use crate::id3::{find_id3v1, find_lyrics3v2, ID3FindResults};
use crate::macros::{decode_err, err};
use crate::mpeg::header::HEADER_MASK;
use crate::probe::ParseOptions;
use crate::probe::{ParseOptions, ParsingMode};

use std::io::{Read, Seek, SeekFrom};

Expand All @@ -32,6 +32,9 @@ where
while let Ok(()) = reader.read_exact(&mut header) {
match header {
// [I, D, 3, ver_major, ver_minor, flags, size (4 bytes)]
//
// Best case scenario, we find an ID3v2 tag at the beginning of the file.
// We will check again after finding the frame sync, in case the tag is buried in junk.
[b'I', b'D', b'3', ..] => {
// Seek back to read the tag in full
reader.seek(SeekFrom::Current(-4))?;
Expand All @@ -57,6 +60,8 @@ where

continue;
},
// TODO: APE tags may suffer the same issue as ID3v2 tag described above.
// They are not nearly as important to preserve, however.
[b'A', b'P', b'E', b'T'] => {
log::warn!(
"Encountered an APE tag at the beginning of the file, attempting to read"
Expand Down Expand Up @@ -85,9 +90,50 @@ where
reader.seek(SeekFrom::Current(-1 * header.len() as i64))?;

#[allow(clippy::used_underscore_binding)]
if let Some((_first_first_header, _first_frame_offset)) = find_next_frame(reader)? {
if let Some((_first_frame_header, _first_frame_offset)) = find_next_frame(reader)? {
// TODO: We are manually searching through junk here, this could potentially be moved into `find_id3v2()`
if file.id3v2_tag.is_none()
&& parse_options.parsing_mode != ParsingMode::Strict
&& _first_frame_offset > 0
{
reader.seek(SeekFrom::Start(0))?;

let search_window_size =
std::cmp::min(_first_frame_offset, parse_options.max_junk_bytes as u64);
let mut id3v2_search_window = reader.take(search_window_size);

// TODO: A whole lot of code duplication here, its nearly identical to what we did above
if let Some(id3v2_offset) = find_id3v2_in_junk(&mut id3v2_search_window)? {
log::warn!(
"Found an ID3v2 tag preceded by junk data, offset: {}",
id3v2_offset
);

reader.seek(SeekFrom::Current(-3))?;

let header = Id3v2Header::parse(reader)?;
let skip_footer = header.flags.footer;

let id3v2 = parse_id3v2(reader, header, parse_options.parsing_mode)?;
if let Some(existing_tag) = &mut file.id3v2_tag {
// https://github.com/Serial-ATA/lofty-rs/issues/87
// Duplicate tags should have their frames appended to the previous
for frame in id3v2.frames {
existing_tag.insert(frame);
}
continue;
}

if skip_footer {
reader.seek(SeekFrom::Current(10))?;
}

file.id3v2_tag = Some(id3v2);
}
}

first_frame_offset = _first_frame_offset;
first_frame_header = Some(_first_first_header);
first_frame_header = Some(_first_frame_header);
break;
}
},
Expand Down Expand Up @@ -187,3 +233,25 @@ where

Ok(None)
}

/// Searches for an ID3v2 tag in (potential) junk data between the start
/// of the file and the first frame
fn find_id3v2_in_junk<R>(reader: &mut R) -> Result<Option<u64>>
where
R: Read,
{
let bytes = reader.bytes();

let mut id3v2_header = [0; 3];

for (index, byte) in bytes.enumerate() {
id3v2_header[0] = id3v2_header[1];
id3v2_header[1] = id3v2_header[2];
id3v2_header[2] = byte?;
if id3v2_header == *b"ID3" {
return Ok(Some((index - 2) as u64));
}
}

Ok(None)
}
6 changes: 5 additions & 1 deletion src/probe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,11 @@ impl<R: Read + Seek> Probe<R> {
}

// Guess the file type by using these 36 bytes
match FileType::from_buffer_inner(&buf[..buf_len]) {
let Some(file_type_guess) = FileType::from_buffer_inner(&buf[..buf_len]) else {
return Ok(None);
};

match file_type_guess {
// We were able to determine a file type
FileTypeGuessResult::Determined(file_ty) => Ok(Some(file_ty)),
// The file starts with an ID3v2 tag; this means other data can follow (e.g. APE or MP3 frames)
Expand Down