Serial-ATA · Serial-ATA · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - **ID3v1**: Renamed `GENRES[14]` to `"R&B"` (Previously `"Rhythm & Blues"`) ([PR](https://github.com/Serial-ATA/lofty-rs/pull/296))
 - **MP4**: Duration milliseconds are now rounded to the nearest whole number ([PR](https://github.com/Serial-ATA/lofty-rs/pull/298))
 - **ID3v2**: Stop erroring on empty frames when not using `ParsingMode::Strict` ([PR](https://github.com/Serial-ATA/lofty-rs/pull/299))
+- **resolve**: Custom resolvers will now be checked before the default resolvers ([PR](https://github.com/Serial-ATA/lofty-rs/pull/319))
+- **MPEG**: Up to `max_junk_bytes` will now be searched for tags between the start of the file and the first MPEG frame ([PR](https://github.com/Serial-ATA/lofty-rs/pull/320))
+  - This allows us to read and write ID3v2 tags that are preceeded by junk
 
 ### Fixed
 - **MP4**:

diff --git a/src/ape/read.rs b/src/ape/read.rs
@@ -28,7 +28,6 @@ where
 	let mut ape_tag: Option<ApeTag> = None;
 
 	// ID3v2 tags are unsupported in APE files, but still possible
-	#[allow(unused_variables)]
 	if let ID3FindResults(Some(header), Some(content)) = find_id3v2(data, true)? {
 		log::warn!("Encountered an ID3v2 tag. This tag cannot be rewritten to the APE file!");
 

diff --git a/src/file.rs b/src/file.rs
@@ -893,7 +893,7 @@ impl FileType {
 	/// ```
 	pub fn from_buffer(buf: &[u8]) -> Option<Self> {
 		match Self::from_buffer_inner(buf) {
-			FileTypeGuessResult::Determined(file_ty) => Some(file_ty),
+			Some(FileTypeGuessResult::Determined(file_ty)) => Some(file_ty),
 			// We make no attempt to search past an ID3v2 tag or junk here, since
 			// we only provided a fixed-sized buffer to search from.
 			//
@@ -903,33 +903,30 @@ impl FileType {
 	}
 
 	// TODO: APE tags in the beginning of the file
-	pub(crate) fn from_buffer_inner(buf: &[u8]) -> FileTypeGuessResult {
+	pub(crate) fn from_buffer_inner(buf: &[u8]) -> Option<FileTypeGuessResult> {
 		use crate::id3::v2::util::synchsafe::SynchsafeInteger;
 
 		// Start out with an empty return
-		let mut ret = FileTypeGuessResult::Undetermined;
+		let mut ret = None;
 
 		if buf.is_empty() {
 			return ret;
 		}
 
 		match Self::quick_type_guess(buf) {
-			Some(f_ty) => ret = FileTypeGuessResult::Determined(f_ty),
+			Some(f_ty) => ret = Some(FileTypeGuessResult::Determined(f_ty)),
 			// Special case for ID3, gets checked in `Probe::guess_file_type`
 			// The bare minimum size for an ID3v2 header is 10 bytes
 			None if buf.len() >= 10 && &buf[..3] == b"ID3" => {
 				// This is infallible, but preferable to an unwrap
 				if let Ok(arr) = buf[6..10].try_into() {
 					// Set the ID3v2 size
-					ret =
-						FileTypeGuessResult::MaybePrecededById3(u32::from_be_bytes(arr).unsynch());
+					ret = Some(FileTypeGuessResult::MaybePrecededById3(
+						u32::from_be_bytes(arr).unsynch(),
+					));
 				}
 			},
-			None if buf.first().copied() == Some(0) => {
-				ret = FileTypeGuessResult::MaybePrecededByJunk
-			},
-			// We aren't able to determine a format
-			_ => {},
+			None => ret = Some(FileTypeGuessResult::MaybePrecededByJunk),
 		}
 
 		ret
@@ -1020,8 +1017,6 @@ pub(crate) enum FileTypeGuessResult {
 	Determined(FileType),
 	/// The stream starts with an ID3v2 tag
 	MaybePrecededById3(u32),
-	/// The stream starts with junk zero bytes
+	/// The stream starts with potential junk data
 	MaybePrecededByJunk,
-	/// The `FileType` could not be guessed
-	Undetermined,
 }
diff --git a/src/id3/v2/frame/header.rs b/src/id3/v2/frame/header.rs
@@ -70,6 +70,8 @@ where
 	let mut id_end = 4;
 	let mut invalid_v2_frame = false;
 	if header[3] == 0 && !synchsafe {
+		log::warn!("Found a v2 frame ID in a v3 tag, attempting to upgrade");
+
 		invalid_v2_frame = true;
 		id_end = 3;
 	}

diff --git a/src/id3/v2/frame/read.rs b/src/id3/v2/frame/read.rs
@@ -64,6 +64,8 @@ impl<'a> ParsedFrame<'a> {
 
 		// Get the encryption method symbol
 		if let Some(enc) = flags.encryption.as_mut() {
+			log::trace!("Reading encryption method symbol");
+
 			if size < 1 {
 				return Err(Id3v2Error::new(Id3v2ErrorKind::BadFrameLength).into());
 			}
@@ -74,6 +76,8 @@ impl<'a> ParsedFrame<'a> {
 
 		// Get the group identifier
 		if let Some(group) = flags.grouping_identity.as_mut() {
+			log::trace!("Reading group identifier");
+
 			if size < 1 {
 				return Err(Id3v2Error::new(Id3v2ErrorKind::BadFrameLength).into());
 			}
@@ -84,6 +88,8 @@ impl<'a> ParsedFrame<'a> {
 
 		// Get the real data length
 		if flags.data_length_indicator.is_some() || flags.compression {
+			log::trace!("Reading data length indicator");
+
 			if size < 4 {
 				return Err(Id3v2Error::new(Id3v2ErrorKind::BadFrameLength).into());
 			}

diff --git a/src/id3/v2/write/mod.rs b/src/id3/v2/write/mod.rs
@@ -48,7 +48,7 @@ pub(crate) fn write_id3v2<'a, I: Iterator<Item = FrameRef<'a>> + Clone + 'a>(
 
 	// Unable to determine a format
 	if file_type.is_none() {
-		err!(UnsupportedTag);
+		err!(UnknownFormat);
 	}
 
 	let file_type = file_type.unwrap();

diff --git a/src/mpeg/read.rs b/src/mpeg/read.rs
@@ -7,7 +7,7 @@ use crate::id3::v2::read::parse_id3v2;
 use crate::id3::{find_id3v1, find_lyrics3v2, ID3FindResults};
 use crate::macros::{decode_err, err};
 use crate::mpeg::header::HEADER_MASK;
-use crate::probe::ParseOptions;
+use crate::probe::{ParseOptions, ParsingMode};
 
 use std::io::{Read, Seek, SeekFrom};
 
@@ -32,6 +32,9 @@ where
 	while let Ok(()) = reader.read_exact(&mut header) {
 		match header {
 			// [I, D, 3, ver_major, ver_minor, flags, size (4 bytes)]
+			//
+			// Best case scenario, we find an ID3v2 tag at the beginning of the file.
+			// We will check again after finding the frame sync, in case the tag is buried in junk.
 			[b'I', b'D', b'3', ..] => {
 				// Seek back to read the tag in full
 				reader.seek(SeekFrom::Current(-4))?;
@@ -57,6 +60,8 @@ where
 
 				continue;
 			},
+			// TODO: APE tags may suffer the same issue as ID3v2 tag described above.
+			//       They are not nearly as important to preserve, however.
 			[b'A', b'P', b'E', b'T'] => {
 				log::warn!(
 					"Encountered an APE tag at the beginning of the file, attempting to read"
@@ -85,9 +90,50 @@ where
 				reader.seek(SeekFrom::Current(-1 * header.len() as i64))?;
 
 				#[allow(clippy::used_underscore_binding)]
-				if let Some((_first_first_header, _first_frame_offset)) = find_next_frame(reader)? {
+				if let Some((_first_frame_header, _first_frame_offset)) = find_next_frame(reader)? {
+					// TODO: We are manually searching through junk here, this could potentially be moved into `find_id3v2()`
+					if file.id3v2_tag.is_none()
+						&& parse_options.parsing_mode != ParsingMode::Strict
+						&& _first_frame_offset > 0
+					{
+						reader.seek(SeekFrom::Start(0))?;
+
+						let search_window_size =
+							std::cmp::min(_first_frame_offset, parse_options.max_junk_bytes as u64);
+						let mut id3v2_search_window = reader.take(search_window_size);
+
+						// TODO: A whole lot of code duplication here, its nearly identical to what we did above
+						if let Some(id3v2_offset) = find_id3v2_in_junk(&mut id3v2_search_window)? {
+							log::warn!(
+								"Found an ID3v2 tag preceded by junk data, offset: {}",
+								id3v2_offset
+							);
+
+							reader.seek(SeekFrom::Current(-3))?;
+
+							let header = Id3v2Header::parse(reader)?;
+							let skip_footer = header.flags.footer;
+
+							let id3v2 = parse_id3v2(reader, header, parse_options.parsing_mode)?;
+							if let Some(existing_tag) = &mut file.id3v2_tag {
+								// https://github.com/Serial-ATA/lofty-rs/issues/87
+								// Duplicate tags should have their frames appended to the previous
+								for frame in id3v2.frames {
+									existing_tag.insert(frame);
+								}
+								continue;
+							}
+
+							if skip_footer {
+								reader.seek(SeekFrom::Current(10))?;
+							}
+
+							file.id3v2_tag = Some(id3v2);
+						}
+					}
+
 					first_frame_offset = _first_frame_offset;
-					first_frame_header = Some(_first_first_header);
+					first_frame_header = Some(_first_frame_header);
 					break;
 				}
 			},
@@ -187,3 +233,25 @@ where
 
 	Ok(None)
 }
+
+/// Searches for an ID3v2 tag in (potential) junk data between the start
+/// of the file and the first frame
+fn find_id3v2_in_junk<R>(reader: &mut R) -> Result<Option<u64>>
+where
+	R: Read,
+{
+	let bytes = reader.bytes();
+
+	let mut id3v2_header = [0; 3];
+
+	for (index, byte) in bytes.enumerate() {
+		id3v2_header[0] = id3v2_header[1];
+		id3v2_header[1] = id3v2_header[2];
+		id3v2_header[2] = byte?;
+		if id3v2_header == *b"ID3" {
+			return Ok(Some((index - 2) as u64));
+		}
+	}
+
+	Ok(None)
+}
diff --git a/src/probe.rs b/src/probe.rs
@@ -542,7 +542,11 @@ impl<R: Read + Seek> Probe<R> {
 		}
 
 		// Guess the file type by using these 36 bytes
-		match FileType::from_buffer_inner(&buf[..buf_len]) {
+		let Some(file_type_guess) = FileType::from_buffer_inner(&buf[..buf_len]) else {
+			return Ok(None);
+		};
+
+		match file_type_guess {
 			// We were able to determine a file type
 			FileTypeGuessResult::Determined(file_ty) => Ok(Some(file_ty)),
 			// The file starts with an ID3v2 tag; this means other data can follow (e.g. APE or MP3 frames)