-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathexport_blogger_comments.sh
executable file
·151 lines (135 loc) · 9.13 KB
/
export_blogger_comments.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env bash
# encoding: utf-8
#
# Export a Blogger blog's posts plus their Google+ comments into a single
# complete JSON data file at "data/output/<domain>.json".
#
# Usage:  export_blogger_comments.sh <blog-url>
# Env:    PT_PATH - project root; defaults to the parent of this script's dir.
# Output: prints the path of the complete blog data file on stdout.
#
# Relies on helper scripts under $PT_PATH/bin and on functions sourced from
# $PT_PATH/lib/functions.sh (domain_from_url, ensure_path, debug, abort_if,
# setxattr, timestamp). No `set -e`: failures of the helper scripts are
# checked explicitly so the user can choose to abort/retry/skip.
PT_PATH="${PT_PATH:-"$(realpath "$(dirname "$0")/..")"}"
. "${PT_PATH}/lib/functions.sh"

domain="$(domain_from_url "$1")"
echo "Exporting Blogger blog at $domain"
blog_id=$("$PT_PATH/bin/get_blogger_id.sh" "$1")
complete_blog_data_file="$(ensure_path "data/output" "$domain.json")"
debug "Storing initial data into: $complete_blog_data_file"
# Build the skeleton document with jq so the blog id is properly JSON-escaped
# (the previous inline-echo construction broke on quotes/special characters).
jq -n --arg id "$blog_id" '{blog: {id: $id, posts: [], post_urls: []}}' > "$complete_blog_data_file"
# Failure logs are appended below; make sure the directory exists up front.
mkdir -p "./logs"
blogger_posts_json_files=$("$PT_PATH/bin/get_blogger_api_post_data_files.sh" "$blog_id")
while read -r blogger_posts_json_file; do
  debug "\n==================================="
  debug "Processing $blogger_posts_json_file"
  data=$(jq '.items' "$blogger_posts_json_file")
  "$PT_PATH/bin/add_items_to_complete_blog_data_file.sh" "$data" "$complete_blog_data_file" "blog,posts" "$blogger_posts_json_file"
  # Call the script just to make sure the actual URLs files are also saved; a bit redundant perhaps?
  # post_urls=$("$PT_PATH/bin/get_blogger_post_urls_from_api_post_data_file.sh" "$blogger_posts_json_file")
  last_index="$(echo "$data" | jq 'length - 1')"
  # last_index="1"
  # NB: if .items is empty, last_index is -1 and `seq 0 -1` yields nothing,
  # so the loop body is safely skipped.
  for i in $(seq 0 "$last_index"); do
    debug "\n--------------------------------"
    debug "Processing blogger post item [$i/$last_index]"
    post_url="$(echo "$data" | jq -r --arg i "$i" '.[$i|tonumber] | .url')"
    exit_code="$?"
    if (( exit_code >= 1 )); then
      # abort_if exits 0 when the user chooses to abort; otherwise skip item.
      input="$(abort_if "a" "Cannot find URL item in data from '${blogger_posts_json_file}'. (a)bort, (n)ext item? [a/N]")" && exit 255 || continue
    fi
    debug "Logging post url to .blog .post_urls: $post_url"
    "$PT_PATH/bin/add_items_to_complete_blog_data_file.sh" "[\"$post_url\"]" "$complete_blog_data_file" "blog,post_urls" "$blogger_posts_json_file"
    debug "-"
    debug "Requesting GPlus Comments Widget for: $post_url"
    gplus_widget=$("$PT_PATH/bin/request_gplus_comments_widget_for_url.sh" "$post_url")
    exit_code="$?"
    if (( exit_code >= 1 )); then
      input="$(abort_if "a" "Error while requesting GPlus Comments Widget for '$post_url' while processing '${blogger_posts_json_file}'. (a)bort, (n)ext item? [a/N]")" && exit 255 || continue
    fi
    "$PT_PATH/bin/add_items_to_complete_blog_data_file.sh" "\"$gplus_widget\"" "$complete_blog_data_file" "blog,posts,$i,google_plus_comments_widget_file" "$blogger_posts_json_file"
    debug "-"
    debug "Retrieving GPlus API Activity IDs from GPlus Comments Widget stored at: $gplus_widget"
    gplus_activity_ids="$("$PT_PATH/bin/get_gplus_api_activity_ids_from_gplus_comments_widget_file.sh" "$gplus_widget")"
    debug "Activity IDs: $gplus_activity_ids"
    # Turn the newline-separated id list into a JSON array; never pass data as
    # the printf *format* string (ids containing '%' would corrupt the output).
    activity_ids_data="$(printf '%s\n' "$gplus_activity_ids" | jq -nR '[inputs | select(length>0)]')"
    "$PT_PATH/bin/add_items_to_complete_blog_data_file.sh" "$activity_ids_data" "$complete_blog_data_file" "blog,posts,$i,activity_ids" "$gplus_widget"
    while read -r gplus_activity_id; do
      debug "\n---"
      if [[ -z "$gplus_activity_id" ]]; then
        debug "=#= No activity IDs found in GPlus Comments Widget ('$gplus_widget'); maybe comments were made using Blogger's native comments?"
        echo "'$post_url' -> '$gplus_widget'" >> "./logs/blogger-gplus-comments-widgets-without-gplus-api-activity-ids.txt"
        continue
      fi
      debug "Requesting JSON from GPlus Activity API for $gplus_activity_id:"
      gplus_activity_file="$("$PT_PATH/bin/get_gplus_api_activity_by_gplus_activity_id.sh" "$gplus_activity_id")"
      exit_code="$?"
      if (( exit_code >= 1 )); then
        input=$(abort_if "a" "Error while requesting JSON from GPlus Activity API for Activity with id '$gplus_activity_id' while processing '${blogger_posts_json_file}'. (a)bort, (r)etry, (n)ext Activity item? [a/r/N]") && exit 255
        if [[ "$input" == [rR] ]]; then
          # Retry once from scratch: remove the (possibly partial) cached file.
          if [[ -f "$gplus_activity_file" ]]; then
            rm "$gplus_activity_file"
          fi
          gplus_activity_file="$("$PT_PATH/bin/get_gplus_api_activity_by_gplus_activity_id.sh" "$gplus_activity_id")"
          if (( $? >= 1 )); then
            debug "=!= Failed again; continuing with next item."
            continue
          fi
        else
          continue
        fi
      fi
      # Record provenance on the cached file via extended attributes.
      setxattr "activity_id" "$gplus_activity_id" "$gplus_activity_file" 1>&2
      setxattr "widget_file" "$gplus_widget" "$gplus_activity_file" 1>&2
      # Slurp the activity JSON into a one-element array for the add script.
      activity_data="$(jq -s '.' "$gplus_activity_file")"
      "$PT_PATH/bin/add_items_to_complete_blog_data_file.sh" "$activity_data" "$complete_blog_data_file" "blog,posts,$i,activities" "$gplus_activity_file"
      debug "-"
      debug "Checking if we need to request JSON from the GPlus Comments API for Activity '$gplus_activity_file' with ID $gplus_activity_id"
      gplus_comments_file="$("$PT_PATH/bin/get_gplus_api_comments_by_gplus_activity_file.sh" "$gplus_activity_file")"
      exit_code="$?"
      if (( exit_code >= 1 )); then
        input=$(abort_if "a" "Error while requesting JSON from GPlus Comments API for Activity with id '$gplus_activity_id' while processing '${gplus_activity_file}'. (a)bort, (r)etry, (n)ext Activity item? [a/r/N]") && exit 255
        if [[ "$input" == [rR] ]]; then
          if [[ -f "$gplus_comments_file" ]]; then
            rm "$gplus_comments_file"
          fi
          gplus_comments_file="$("$PT_PATH/bin/get_gplus_api_comments_by_gplus_activity_file.sh" "$gplus_activity_file")"
          if (( $? >= 1 )); then
            debug "=!= Failed again; continuing with next item."
            continue
          fi
        else
          continue
        fi
      fi
      # An empty name also fails the -f test, so one check covers both cases.
      if [[ ! -f "$gplus_comments_file" ]]; then
        debug "=*= No GPlus Comments File: '$gplus_comments_file'"
        continue
      fi
      setxattr "activity_id" "$gplus_activity_id" "$gplus_comments_file" 1>&2
      setxattr "widget_file" "$gplus_widget" "$gplus_comments_file" 1>&2
      setxattr "activity_file" "$gplus_activity_file" "$gplus_comments_file" 1>&2
      comments_data="$(jq -s '.' "$gplus_comments_file")"
      # Add to the complete blog data file manually, since we require a target filter.
      # Write jq's output to a temp file and only replace the data file on
      # success, so a failed jq run can never truncate the real file.
      tmp_file="${complete_blog_data_file}.$(timestamp "%Y%m%d-%H%M%S")"
      if jq --argjson newItems "$comments_data" --arg postsIndex "$i" --arg activityId "$gplus_activity_id" '. as $input | $input | .blog .posts[$postsIndex|tonumber] .activities | map(.id == $activityId)|index(true) as $index | $input | .blog .posts[$postsIndex|tonumber] .activities[$index] .object .replies .comments += $newItems' "$complete_blog_data_file" > "$tmp_file" \
          && mv "$tmp_file" "$complete_blog_data_file"; then
        :
      else
        exit_code="$?"
        echo "'.blog .posts[$i] .activities[] | select(.id == $gplus_activity_id) | .object .replies .comments ' -> '$gplus_comments_file'" >> "./logs/failed-complete-blog-posts-adds.log"
        debug "=!= [$exit_code] Error while adding comments to complete blog data file."
        rm -f -- "$tmp_file"
      fi
      #TODO: retrieve plusoners/resharers
      # Get all self-links: $(cat "$complete_blog_data_file"|jq '. as $source | $source | [.blog .posts[0] .activities[]|path(..)|[.[]|tostring]|select(any(. == "selfLink"))|join(",")]|unique | .[] | [split(",")] as $selflinks | $source .blog .posts[0] .activities[] | [getpath($selflinks[])] | unique | add')
      #TODO: retrieve attachments
      #TODO: cache actors
      #TODO: cache user avatars
      #TODO: VERIFY that commentless posts are indeed without activities too...
      # $(cat "$complete_blog_data_file" | jq '. as $source | $source | [.blog .posts[0] .activities[]|path(..)|[.[]|tostring]|select(.[-1] == "actor")|join(",")]|unique | .[] | [split(",")] as $selflinks | $source .blog .posts[0] .activities[] | [getpath($selflinks[])]')
    done <<< "$gplus_activity_ids"
  done
done <<< "$blogger_posts_json_files"
echo "$complete_blog_data_file"
# Elaborate piping example; will probably gradually be replaced with the stage-wise above approach, so the final results will be in a neat, complete JSON file.
# "$PT_PATH/bin/get_blogger_post_urls.sh" "$("$PT_PATH/bin/get_blogger_id.sh" "$1")" | xargs -L 1 "$PT_PATH/bin/request_gplus_comments_widget_for_url.sh" | xargs -L 1 "$PT_PATH/bin/get_gplus_api_activity_ids_from_gplus_comments_widget_file.sh" | xargs -L 1 "$PT_PATH/bin/get_gplus_api_activity_by_gplus_activity_id.sh" | xargs -L 1 "$PT_PATH/bin/get_gplus_api_comments_by_gplus_activity_file.sh"
# #FIXME: make sure the script does this
# mkdir -p "./data/output/$domain/html"
#
#
# #FIXME: Make it so that you aren't basically repeating all these lookups, even though they are cached...
# for filename in $(find "data/comments_frames/$domain/"* )
# do
#   #FIXME: keep track of where you are, so you can abort, and continue again at a later time without having to restart.
#   echo "$filename"
#   echo $(basename "$filename")
#   echo "$filename" | "$PT_PATH/bin/get_gplus_api_activity_ids_from_gplus_comments_widget_file.sh" | "$PT_PATH/bin/get_comments_from_google_plus_api_by_activity_id.sh" > "data/output/$domain/html/$(basename "$filename")"
# done