-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.py
102 lines (85 loc) · 2.78 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
"""
Main script for Wikipedia Article Archiver.
This script archives Wikipedia articles from specified categories.
"""
import sys
import os
import argparse
from datetime import datetime
# Add the project root to Python path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from wiki_archiver.config import *
from wiki_archiver.core import WikiArchiver
from wiki_archiver.logging import logger
def parse_arguments():
    """
    Build the archiver's command-line interface and parse ``sys.argv``.

    Each option falls back to a module-level default (``CATEGORIES``,
    ``LANGUAGE``, ``MAX_DEPTH``, ``OUTPUT_DIR``) when it is not given on
    the command line. Those names are expected to come from the wildcard
    import of ``wiki_archiver.config``.

    Returns:
        argparse.Namespace: Parsed options with the attributes
        ``categories`` (one or more values when given on the command
        line), ``language``, ``depth`` (converted with ``int``) and
        ``output``.
    """
    # One entry per option: (flag spellings, add_argument keyword settings).
    option_specs = (
        (
            ("-c", "--categories"),
            {
                "nargs": '+',  # accept several categories after one flag
                "default": CATEGORIES,
                "help": "Wikipedia categories to archive (space-separated)",
            },
        ),
        (
            ("-l", "--language"),
            {
                "default": LANGUAGE,
                "help": "Wikipedia language",
            },
        ),
        (
            ("-d", "--depth"),
            {
                "type": int,
                "default": MAX_DEPTH,
                "help": "Maximum category recursion depth",
            },
        ),
        (
            ("-o", "--output"),
            {
                "default": OUTPUT_DIR,
                "help": "Output directory for archived articles",
            },
        ),
    )

    cli = argparse.ArgumentParser(description="Wikipedia Article Archiver")
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
def main():
    """
    Run one archiving session from the command line.

    Steps, in order: parse the command-line options, validate the
    configuration, build a ``WikiArchiver`` from the language, categories
    and depth options, load and log any earlier progress, scrape the
    categories, save progress, and log the elapsed time.

    Error handling:
        * ``KeyboardInterrupt``: a warning is logged and, if the archiver
          was already constructed, a best-effort ``save_progress()`` is
          attempted so the interrupted run is not discarded. The function
          then returns normally.
        * Any other ``Exception``: logged together with its traceback,
          then the process exits with status 1.
        * The final "finished" message is logged on every path.

    Raises:
        SystemExit: With status 1 when ``validate_config()`` returns a
            falsy value or an unexpected error occurs. A ``SystemExit``
            raised by argument parsing also propagates unchanged.
    """
    # Stays None until construction succeeds, so the interrupt handler can
    # tell whether there is anything to save.
    archiver = None
    try:
        # Parse arguments
        args = parse_arguments()

        # Validate configuration (imported locally, as in the original layout)
        from wiki_archiver.config import validate_config
        if not validate_config():
            logger.error("Invalid configuration. Exiting.")
            sys.exit(1)

        # Initialize archiver
        # NOTE(review): args.output (-o/--output) is parsed but never
        # forwarded anywhere in this script, so the flag currently has no
        # effect here. Confirm how WikiArchiver selects its output directory
        # and pass the value through.
        archiver = WikiArchiver(
            language=args.language,
            categories=args.categories,
            max_depth=args.depth
        )

        # Load previous progress
        previous_progress = archiver.load_progress()
        logger.info(f"Loaded previous progress: {previous_progress}")

        # Start archiving
        start_time = datetime.now()
        logger.info(f"Starting archive for categories: {args.categories}")
        archiver.scrape_categories()

        # Save progress
        archiver.save_progress()

        # Log completion
        duration = datetime.now() - start_time
        logger.info(f"Archiving completed in {duration}")
    except KeyboardInterrupt:
        logger.warning("Archiving interrupted by user.")
        # Previously progress was written only on the success path, so a
        # Ctrl+C threw away whatever the run had accumulated.
        if archiver is not None:
            try:
                archiver.save_progress()
            except Exception:
                # Best effort only: a failed save must not mask the interrupt.
                logger.exception("Could not save progress after interruption.")
    except Exception as e:
        # logger.exception records the traceback; logging only str(e) made
        # failures very hard to diagnose.
        logger.exception(f"Unexpected error: {e}")
        sys.exit(1)
    finally:
        # Cleanup and final logging
        logger.info("Wikipedia Article Archiver finished.")
# Run the archiver only when this file is executed as a script, so that
# importing the module does not start an archiving session.
if __name__ == "__main__":
    main()