diff --git a/stattic.py b/stattic.py
index 45eace0..1aec26d 100644
--- a/stattic.py
+++ b/stattic.py
@@ -141,20 +141,24 @@ def minify_assets(self):
         except Exception as e:
             self.logger.error(f"Failed to minify assets: {e}")
 
-    def format_date(self, date_str):
+    def format_date(self, date_str=None):
         """Format the date from 'YYYY-MM-DDTHH:MM:SS' to 'Month DD, YYYY'."""
-        try:
-            # If date_str is already a datetime.date or datetime.datetime object, format it directly
-            if isinstance(date_str, (datetime, datetime.date)):
-                return date_str.strftime('%B %d, %Y')
+        if not date_str:
+            return ''  # Return an empty string if no date is provided
 
-            # Parse the input string to a datetime object if it's a string
+        try:
+            # Attempt to parse and format the date with time
             date_obj = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S')
-            return date_obj.strftime('%B %d, %Y')
+        except ValueError:
+            try:
+                # Fall back to parsing the date without a time component
+                date_obj = datetime.strptime(date_str, '%Y-%m-%d')
+            except ValueError:
+                # If parsing fails, return the original date string
+                return date_str
 
-        except (ValueError, TypeError):
-            # Return the original string if formatting fails or if it's not a string/datetime
-            return date_str
+        # Return the formatted date
+        return date_obj.strftime('%b %d, %Y')
 
     def load_categories_and_tags(self):
         """Load categories and tags from YAML files."""
@@ -192,24 +196,25 @@ def load_pages(self):
                 filepath = os.path.join(self.pages_dir, page_file)
                 metadata, _ = self.parse_markdown_with_metadata(filepath)
 
-                # Fix title extraction if it's a dictionary with 'rendered'
                 title = metadata.get('title', 'Untitled')
                 if isinstance(title, dict):
                     title = title.get('rendered', 'Untitled')
 
-                # Get the order from frontmatter or default to a high number for unordered pages
                 order = metadata.get('order', 100)
+
+                # Normalize nav_hide to a lowercase string for consistent comparisons
+                nav_hide = str(metadata.get('nav_hide', '')).strip().lower()
 
                 # Add page metadata to self.pages
                 self.pages.append({
                     'title': title,
                     'permalink': f"/{metadata.get('custom_url', page_file.replace('.md', ''))}/",
-                    'order': order
+                    'order': order,
+                    'nav_text': metadata.get('nav_text'),
+                    'nav_hide': nav_hide  # Store it consistently as lowercase
                 })
 
-            # Sort pages by the order field
             self.pages = sorted(self.pages, key=lambda x: x['order'])
-
             self.logger.info(f"Loaded {len(self.pages)} pages for navigation")
 
         except Exception as e:
@@ -218,6 +223,11 @@ def load_pages(self):
     def download_image(self, url, output_dir, markdown_file_path=None):
         """Download an image and save it locally, or check if it's a local image."""
         try:
+            # Only process URLs with common image file extensions
+            if not url.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff')):
+                self.logger.warning(f"Skipping non-image URL: {url}")
+                return None
+
             # Check if the URL is relative (starts with a slash or '..' indicating local reference)
             if url.startswith('/') or url.startswith('../') or not url.startswith('http'):
                 # If markdown_file_path is provided, resolve the local path relative to it
@@ -380,11 +390,32 @@ def convert_image_to_webp(self, image_path):
         return None
 
     def process_images(self, content):
-        """Find all image URLs in the content, download, and convert them."""
-        image_urls = re.findall(r'!\[.*?\]\((.*?)\)', content)  # Extract image URLs from markdown
+        """Find all image URLs in the content, download, convert them, and replace with local WebP paths."""
+        # Extract image URLs from Markdown syntax
+        markdown_image_urls = re.findall(r'!\[.*?\]\((.*?)\)', content)
+
+        # Extract image URLs from HTML tags, including src, srcset, and wrapped links
+        html_image_urls = re.findall(r'<img [^>]*src="([^"]+)"', content)
+        href_urls = re.findall(r'<a [^>]*href="([^"]+)"', content)
+
+        # Extract srcset image URLs (a single srcset may list multiple URLs)
+        srcset_urls = re.findall(r'srcset="([^"]+)"', content)
+        all_srcset_urls = []
+        for srcset in srcset_urls:
+            all_srcset_urls.extend([url.strip().split(' ')[0] for url in srcset.split(',')])
+
+        # Combine all unique image URLs
+        image_urls = set(markdown_image_urls + html_image_urls + href_urls + all_srcset_urls)
+
         local_image_paths = {}
-
+
+        # Process all unique image URLs found
        for url in image_urls:
+            # Ensure the URL points to an image file
+            if not url.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff')):
+                self.logger.warning(f"Skipping non-image URL: {url}")
+                continue
+
             self.logger.info(f"Processing image: {url}")
             image_name = os.path.basename(url)
             webp_image_path = os.path.join(self.images_dir, image_name.rsplit('.', 1)[0] + '.webp')
@@ -401,9 +432,31 @@ def process_images(self, content):
             if webp_path:
                 local_image_paths[url] = os.path.join('/images', os.path.basename(webp_path))
 
-        # Replace external URLs with local WebP paths in the content
+        # Replace `href` and `src` attributes directly
         for url, webp_path in local_image_paths.items():
-            content = content.replace(url, webp_path)
+            content = content.replace(f'href="{url}"', f'href="{webp_path}"')
+            content = content.replace(f'src="{url}"', f'src="{webp_path}"')
+
+        # Replace `srcset` attributes with all processed image URLs
+        def replace_srcset(match):
+            srcset_value = match.group(1)
+            srcset_entries = srcset_value.split(',')
+
+            # Prepare to replace each URL in the srcset
+            new_srcset_entries = []
+            for entry in srcset_entries:
+                parts = entry.strip().split(' ')
+                url_part = parts[0]
+                # Check if the URL was processed and exists in local_image_paths
+                if url_part in local_image_paths:
+                    parts[0] = local_image_paths[url_part]
+                new_srcset_entries.append(' '.join(parts))
+
+            # Reconstruct the srcset attribute with all updated URLs
+            return 'srcset="' + ', '.join(new_srcset_entries) + '"'
+
+        # Use regex to find srcset attributes and replace them using the function
+        content = re.sub(r'srcset="([^"]+)"', replace_srcset, content)
 
         return content
@@ -467,7 +520,7 @@ def build_post_or_page(self, metadata, html_content, post_slug, output_dir, is_p
         author_name = self.get_author_name(metadata.get('author', 'Unknown'))
 
         # Format the date using the format_date helper function
-        formatted_date = self.format_date(metadata.get('date', ''))
+        formatted_date = self.format_date(metadata.get('date'))
 
         post_categories = []
         for cat_id in metadata.get('categories', []):
@@ -577,7 +630,7 @@ def build_posts_and_pages(self):
                 'title': metadata.get('title', 'Untitled'),
                 'excerpt': self.markdown_filter(metadata.get('excerpt', self.generate_excerpt(md_content))),
                 'permalink': f"/blog/{post_slug}/",
-                'date': metadata.get('date', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
+                'date': self.format_date(metadata.get('date'))
             }
             self.posts.append(post_metadata)
 
@@ -613,26 +666,25 @@ def generate_excerpt(self, content):
     def build_index_page(self):
         """Render and build the index (homepage) with the list of posts."""
         try:
-            # Helper function to convert all date objects to datetime for consistent comparison
-            def get_post_date(post):
-                post_date = post.get('date', None)
-                if post_date:
-                    # If post_date is a date object (without time), convert it to datetime
-                    if isinstance(post_date, date) and not isinstance(post_date, datetime):
-                        return datetime.combine(post_date, datetime.min.time())  # Add time component to date
-                    # If it's already a datetime, return as is
-                    elif isinstance(post_date, datetime):
-                        return post_date
-                    # If it's a string, attempt to parse it
-                    elif isinstance(post_date, str):
-                        try:
-                            return datetime.strptime(post_date, "%Y-%m-%dT%H:%M:%S")
-                        except ValueError:
-                            self.logger.warning(f"Invalid date format in post: {post['title']} - using default date.")
-                            return datetime.min  # Return a minimal date if parsing fails
-                return datetime.min  # Use a minimal date if date is missing
+            def parse_date(date_str):
+                """Try parsing the date with different possible formats."""
+                for fmt in ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%d', '%b %d, %Y']:
+                    try:
+                        return datetime.strptime(date_str, fmt)
+                    except ValueError:
+                        continue
+                self.logger.warning(f"Invalid date format: '{date_str}', defaulting to minimum datetime")
+                return datetime.min  # Default to minimum datetime if no formats match
 
-            # Ensure that all dates are converted before sorting
+            def get_post_date(post):
+                date_str = post.get('date', '')
+                if isinstance(date_str, datetime):
+                    return date_str
+                elif isinstance(date_str, str):
+                    return parse_date(date_str)
+                return datetime.min  # Default to minimum datetime if date is missing or invalid
+
+            # Sort posts by date in descending order
             posts_for_index = sorted(self.posts, key=get_post_date, reverse=True)[:self.posts_per_page]
 
             # Render the index.html template with the list of posts and pages for the menu
@@ -715,7 +767,7 @@ def generate_rss_feed(self, site_url, site_name=None):
                 # Fallback to the current date if parsing fails
                 post_pubdate = datetime.now().strftime('%a, %d %b %Y %H:%M:%S +0000')
             except ValueError:
-                post_pubdate = datetime.strptime(post_date_str, '%Y-%m-%d %H:%M:%S').strftime('%a, %d %b %Y %H:%M:%S +0000')
+                post_pubdate = self.format_date(post.get('date'))
 
             # Generate a unique guid for each post (could be permalink-based hash)
             guid = md5(post_permalink.encode('utf-8')).hexdigest()
@@ -766,13 +818,20 @@ def generate_xml_sitemap(self, site_url):
         for post in self.posts:
             post_permalink = f"{site_url.rstrip('/')}/{post.get('permalink', '').lstrip('/')}"
             post_date_str = post.get('date', datetime.now())
-
-            # Ensure post_date is a datetime object
+
+            # Try multiple formats for the post date
             if isinstance(post_date_str, str):
-                try:
-                    post_date = datetime.strptime(post_date_str, '%Y-%m-%d %H:%M:%S')
-                except ValueError:
-                    post_date = datetime.strptime(post_date_str, '%Y-%m-%dT%H:%M:%S')
+                date_formats = ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%d', '%b %d, %Y']
+                post_date = None
+                for fmt in date_formats:
+                    try:
+                        post_date = datetime.strptime(post_date_str, fmt)
+                        break
+                    except ValueError:
+                        continue
+                if post_date is None:
+                    self.logger.error(f"Date '{post_date_str}' could not be parsed with any known format. Using current date.")
+                    post_date = datetime.now()
             elif isinstance(post_date_str, datetime):
                 post_date = post_date_str
             else:
@@ -788,9 +847,9 @@ def generate_xml_sitemap(self, site_url):
 
         # Generate the full XML sitemap content
         sitemap_xml_content = f"""<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-{''.join(sitemap_entries)}
-</urlset>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+{''.join(sitemap_entries)}
+</urlset>
 """
 
         # Write the XML sitemap to output/sitemap.xml
@@ -807,22 +866,35 @@ def format_xml_sitemap_entry(self, url, lastmod):
         """Format a URL entry for the XML sitemap."""
         escaped_url = escape(url)
 
-        # Ensure lastmod is a datetime and format it accordingly
+        # If lastmod is already a datetime, convert it to the desired format
         if isinstance(lastmod, datetime):
-            lastmod = lastmod.strftime('%Y-%m-%dT%H:%M:%SZ')
+            lastmod_str = lastmod.strftime('%Y-%m-%dT%H:%M:%SZ')
         elif isinstance(lastmod, str):
-            try:
-                # Attempt to parse the string to a datetime object
-                lastmod = datetime.strptime(lastmod, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%dT%H:%M:%SZ')
-            except ValueError:
-                lastmod = datetime.strptime(lastmod, '%Y-%m-%dT%H:%M:%S').strftime('%Y-%m-%dT%H:%M:%SZ')
+            # Attempt each format until one succeeds
+            date_formats = ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%d', '%b %d, %Y']
+            lastmod_str = None
+            for fmt in date_formats:
+                try:
+                    lastmod_dt = datetime.strptime(lastmod, fmt)
+                    lastmod_str = lastmod_dt.strftime('%Y-%m-%dT%H:%M:%SZ')
+                    self.logger.info(f"Successfully parsed date '{lastmod}' with format '{fmt}'")
+                    break
+                except ValueError as e:
+                    self.logger.debug(f"Failed to parse date '{lastmod}' with format '{fmt}': {e}")
+
+            # If no format matches, log the fallback
+            if lastmod_str is None:
+                self.logger.error(f"Date '{lastmod}' could not be parsed with any known format. Using current date instead.")
+                lastmod_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
+        else:
+            lastmod_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
 
         return f'''
-        <url>
-            <loc>{escaped_url}</loc>
-            <lastmod>{escape(lastmod)}</lastmod>
-        </url>
-        '''
+        <url>
+            <loc>{escaped_url}</loc>
+            <lastmod>{lastmod_str}</lastmod>
+        </url>
+        '''
 
     def build_404_page(self):
         """Build and generate the 404 error page for GitHub Pages."""