git ssb

0+

cel / ggscrape



Tree: cd186e29c07f78383647c50e097dffa61d51ebbb

Files: cd186e29c07f78383647c50e097dffa61d51ebbb / ggscrape

5837 bytes · Raw
#!/bin/bash
# vi: ts=4 sw=4 et
#
# ggscrape(1)
# Download emails from a Google Group
#
# Copyright (c) 2014 Charles Lehner
# Released under the terms of the MIT License.

readonly VERSION=1.0.0
readonly BASE_URL='https://groups.google.com/forum/'

# Exit status used for command-line usage errors.
readonly EX_USAGE=64
# Number of topics requested per listing page.
topic_range=100

# Settings filled in by command-line parsing.
opt_verbose=
group_id=
dest_dir=
ln_dir=
# Cookie string sent with every request; taken from $GG_COOKIE when that is
# set and non-empty, and overridable with --cookie.
cookie_str=${GG_COOKIE:-}

# Print a debug message to stderr when verbose mode is enabled.
# Globals:   opt_verbose (read) - non-empty enables output
# Arguments: the words of the message, joined with single spaces
# Outputs:   the message on stderr
debug_print() {
    if [[ -n ${opt_verbose:-} ]]; then
        # printf with "$*" keeps embedded whitespace and glob characters
        # intact and cannot mistake a leading word for an echo option.
        printf '%s\n' "$*" >&2
    fi
}

# Perform an HTTP request relative to the Google Groups base URL.
# Globals:   BASE_URL, cookie_str (read)
# Arguments: $1   - path and query string appended to BASE_URL
#            $2.. - extra options handed to curl unchanged (e.g. -I, -o FILE)
# Outputs:   whatever curl writes to stdout
# Returns:   curl's exit status
req() {
    local path=${1-}
    (( $# )) && shift

    debug_print req: "$path"
    curl -sN -b "$cookie_str" "${BASE_URL}${path}" "$@"
}

# Request a page through the "_escaped_fragment_" query parameter.
# Arguments: $1   - fragment value appended after "?_escaped_fragment_="
#            $2.. - extra options passed on to req (and from there to curl)
# Outputs:   the response written by req
# Returns:   the status of req
req_fragment() {
    local fragment=${1-}
    (( $# )) && shift

    req "?_escaped_fragment_=${fragment}" "$@"
}

# Check that the group's topic list can be fetched with the current cookie.
# Globals:   group_id (read)
# Outputs:   an error message on stderr when the check fails
# Returns:   0 if the response status is 200, 1 otherwise
check_permission() {
    debug_print testing permission
    # -I asks for the headers only. The first line is the HTTP status line;
    # its second field is the status code, compared numerically so that a
    # trailing carriage return does not matter.
    if ! req_fragment "forum/${group_id}%5B1-1-false%5D" -I |
        awk 'NR == 1 { ok = ($2 + 0 == 200); exit } END { exit !ok }'
    then
        echo 'Unable to access group.' >&2
        return 1
    fi
}

# Fetch one page of the group's topic list and print one line per topic.
# Globals:   group_id (read)
# Arguments: $1 - number of the first topic to list
#            $2 - number of the last topic to list
# Outputs:   one line per topic, made of three scraped lines joined by tabs
#            in the order first, third, second (presumably topic id, last
#            post date, subject - depends on the page markup)
# Returns:   99 if no topics were found, otherwise the pipeline's status
get_topics_single() {
    local start=$1
    local end=$2
    local rows status

    debug_print "get topics $group_id [$start-$end]"

    # First sed: print an empty line for a "0 topics" page, and extract the
    # last post date, the last path component of the subject link and the
    # link text. Second sed: quit with status 99 on the empty line, else
    # join every three lines into one tab-separated record.
    rows=$(req_fragment "forum/${group_id}%5B${start}-${end}-false%5D" | sed -n \
        "s/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate\">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href=\"[^\"]*\/\([^\"]*\)\"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }" | sed '/^$/q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/')
    status=$?

    # return status 99 if no matches found. This includes a response that
    # produced no output at all (failed request, unexpected page), so that
    # callers paging without an upper bound always terminate.
    if [[ -z $rows ]]; then
        return 99
    fi
    printf '%s\n' "$rows"
    return "$status"
}

# Report an unrecognised command-line argument and terminate the script.
# Globals:   EX_USAGE (read) - exit status; 64 is used if it is unset
# Arguments: $1 - the offending argument
# Outputs:   an error message on stderr
unknown_option() {
    printf 'Unknown option %s\n' "$1" >&2
    exit "${EX_USAGE:-64}"
}

# List the topics of the group, fetching them page by page.
# Globals:   topic_range (read) - number of topics per request
#            group_id (read)
# Arguments: $1 - number of the first topic; empty or 0 means 1
#            $2 - number of the last topic; empty or 0 means no upper bound,
#                 i.e. keep paging until get_topics_single fails
# Outputs:   the lines printed by get_topics_single
# Returns:   1 if the permission check fails
get_topics() {
    # Arithmetic expansion turns an empty argument into 0.
    local start=$(($1))
    local end=$(($2))
    local i temp_end

    if ((start == 0)); then
        start=1
    fi

    check_permission || return 1

    debug_print "get all topics $group_id [$start-$end]"

    for ((i = start; i <= end || end == 0; i += topic_range)); do
        # Clamp the last page to the requested end when there is one.
        ((temp_end = (end && i + topic_range - 1 > end) ? end : i + topic_range - 1))
        get_topics_single "$i" "$temp_end" || break
    done
}

# get message ids in a topic
# Globals:   group_id (read)
# Arguments: $1 - topic id
# Outputs:   one line per matching link: the part of the link target that
#            follows "<topic id>/"
get_messages() {
    local topic_id="$1"
    local topic_re

    # Escape regex metacharacters and the sed delimiter so that the topic id
    # is matched literally inside the expression below.
    topic_re=$(printf '%s' "$topic_id" | sed 's|[][\.*^$/]|\\&|g')

    req_fragment "topic/${group_id}/${topic_id}" | sed -n \
        's/.*<td class="subject"><a href="[^"]*'"$topic_re"'\/\([^"]*\)".*/\1/p'
}

# Repair a downloaded raw message in place.
# Arguments: $1 - path of the message file
# Outputs:   an error message on stderr if the file does not exist
# Returns:   1 if the file does not exist
fix_message() {
    local file="$1"

    if [[ ! -f $file ]]; then
        echo "fix_message: no such file: $file" >&2
        return 1
    fi

    # Some messages have two duplicate sets of headers, the second set starting
    # with some X-Google stuff. Remove the second set of headers if they are
    # present.
    # The probe succeeds only when the line right after the first empty
    # (CR-only) line starts with X-Google-Groups:. A file without such an
    # empty line is left alone, so the range delete below can never run from
    # a regular header to the end of the file.
    if awk '
        after_blank { dup = ($0 ~ /^X-Google-Groups:/); exit }
        /^\r$/      { after_blank = 1 }
        END         { exit !dup }
    ' < "$file"; then
        debug_print 'Removing duplicate headers'
        sed -i~ '/^X-Google-Groups:/,/^\r$/d' "$file"
    fi
    # Add missing multipart/alternative boundary
    sed -i~ '/^--[^< ]*$/{ n; /^--[^< ]*$/{ s/^--\(.*\)/Content-Type: multipart\/alternative; boundary=\1\n\n\0/; }; }' "$file"
    # sed -i~ leaves a backup copy named "<file>~"; drop it.
    rm -f -- "$file~"
}

# Download one raw message into dest_dir unless it is already there.
# Globals:   dest_dir, group_id, ln_dir (read)
# Arguments: $1 - topic id
#            $2 - message id
# Outputs:   progress messages on stdout, failures on stderr
# Returns:   1 if no temporary file could be created
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    local path temp

    debug_print download topic "$topic_id" message "$msg_id"
    path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"

    # A non-empty file means the message was fetched by an earlier run.
    if [[ -s "$path" ]]; then
        echo "message ${topic_id}${msg_id} already downloaded. skipping."
        return
    fi

    echo "message ${topic_id}${msg_id} downloading."
    if ! temp=$(mktemp); then
        echo "message ${topic_id}${msg_id} failed to download." >&2
        return 1
    fi

    # Download to a temporary file first so that an interrupted transfer
    # never leaves a partial message at the final path. -f makes curl fail
    # on an HTTP error instead of saving the error page as a message.
    if req "message/raw?msg=${group_id}/${topic_id}/${msg_id}" -f -o "$temp"
    then
        fix_message "$temp"
        mv -- "$temp" "$path"
        if [[ -n "$ln_dir" ]]; then
            ln -- "$path" "$ln_dir"
        fi
    else
        echo "message ${topic_id}${msg_id} failed to download." >&2
        rm -f -- "$temp"
    fi
}

# Download every message of one topic.
# Arguments: $1 - topic id
# Outputs:   the progress output of download_message
download_messages() {
    local topic_id="$1"
    local msg_id

    debug_print download topic "$topic_id"
    # Only the first word of each line is the message id.
    get_messages "$topic_id" | while read -r msg_id _; do
        [[ -n $msg_id ]] || continue
        # Keep the download from consuming the list of ids on stdin.
        download_message "$topic_id" "$msg_id" < /dev/null
    done
}

# Download the messages of all topics in the given range.
# Globals:   dest_dir (read) - created if missing
#            ln_dir (read)   - created if set and missing
# Arguments: $1 - number of the first topic (passed to get_topics)
#            $2 - number of the last topic (passed to get_topics)
# Outputs:   the progress output of download_messages
# Exits:     with status 1 if a directory cannot be created
download_mails() {
    local start=$1
    local end=$2
    local topic_id

    mkdir -p "$dest_dir" || exit 1
    # ln would create a plain file named like the link directory if that
    # directory did not exist yet.
    if [[ -n ${ln_dir:-} ]]; then
        mkdir -p "$ln_dir" || exit 1
    fi

    # The first field of each line is the topic id; the rest is ignored.
    get_topics "$start" "$end" | while read -r topic_id _; do
        # The topic list may contain an empty line; it is not a topic.
        [[ -n $topic_id ]] || continue
        download_messages "$topic_id"
    done
}

# Print the program name and version.
# Globals:   VERSION (read)
show_version() {
    printf 'ggscrape %s\n' "$VERSION"
}

# Print the usage text on stdout.
show_help() {
    # The delimiter is quoted so that the text is printed literally.
    cat << 'EOF'
ggscrape. Download emails from a Google Group

Usage:
    ggscrape <group_id> test
    ggscrape <group_id> topics
    ggscrape <group_id> messages <topic_id>
    ggscrape <group_id> download <dest_dir>
    ggscrape fix_message <file>

Options:
    -h, --help              Show this screen
    --version               Show version
    -v, --verbose           Show debug info
    -c, --cookie <cookie>   Use the given cookie string
    -b, --begin <topicnum>  Topic number at which to begin downloading
    -e, --end <topicnum>    Topic number at which to stop downloading
    -l, --ln <ln_dir>       Hard link email files into this directory

Environment variables:

    GG_COOKIE               use as value for --cookie

EOF
}

# Command selected on the command line, and its arguments.
cmd=
topic_id=
topic_begin=
topic_end=
fix_file=

# Terminate with a usage error unless an option is followed by a value.
# Arguments: $1 - option name
#            $2 - number of arguments left, counting the option itself
require_value() {
    if (( $2 < 2 )); then
        printf 'Option %s requires an argument\n' "$1" >&2
        exit "${EX_USAGE:-64}"
    fi
}

# Parse the command line into the global settings.
# Globals:   opt_verbose, cookie_str, topic_begin, topic_end, ln_dir, cmd,
#            topic_id, dest_dir, fix_file, group_id (written)
# Arguments: the script's command-line arguments
# Exits:     after printing help or version, and with a usage error on an
#            unknown option, a second positional argument, or an option
#            that lacks its value
parse_args() {
    while (( $# > 0 )); do
        case "$1" in
            -h|--help) show_help; exit;;
            --version) show_version; exit;;
            -v|--verbose) opt_verbose=1;;
            -c|--cookie) require_value "$1" "$#"; cookie_str="$2"; shift;;
            -b|--begin) require_value "$1" "$#"; topic_begin="$2"; shift;;
            -e|--end) require_value "$1" "$#"; topic_end="$2"; shift;;
            -l|--ln) require_value "$1" "$#"; ln_dir="$2"; shift;;
            topics) cmd=topics;;
            test) cmd=test;;
            # A missing argument is left empty here and is reported when
            # the command is run.
            messages) cmd=messages; topic_id="${2-}"; shift;;
            download) cmd=download; dest_dir="${2-}"; shift;;
            fix_message) cmd=fix_message; fix_file="${2-}"; shift;;
            # Anything else that looks like an option is a mistake, not a
            # group id.
            -*) unknown_option "$1";;
            *)
                # The first free-standing word is the group id.
                if [[ -z "$group_id" ]]; then
                    group_id="$1"
                else
                    unknown_option "$1"
                fi;;
        esac
        shift
    done
}

parse_args "$@"

# fix_message works on a local file and needs no group id.
if [[ "$cmd" == fix_message ]]; then
    if [[ -z "$fix_file" ]]; then
        show_help
        exit 1
    fi
    fix_message "$fix_file"
    exit
fi

# Every other command operates on a group.
if [[ -z "$group_id" ]]; then
    show_help
    exit 1
fi

case "$cmd" in
    '') show_help; exit;;
    test) check_permission && echo Success;;
    topics) get_topics "$topic_begin" "$topic_end";;
    download)
        # Same handling as a missing file for fix_message.
        if [[ -z "$dest_dir" ]]; then
            show_help
            exit 1
        fi
        download_mails "$topic_begin" "$topic_end";;
    messages)
        if [[ -z "$topic_id" ]]; then
            show_help
            exit 1
        fi
        get_messages "$topic_id";;
    *) echo "Unknown command $cmd" >&2; exit "${EX_USAGE:-64}";;
esac


Built with git-ssb-web