Files: 85166133f139a08a92d9d4a72ec96b266d9efbd1 / ggscrape
4929 bytesRaw
#!/bin/bash
# vi: ts=4 sw=4 et
#
# ggscrape(1)
# Download emails from a Google Group
#
# Copyright (c) 2014 Charles Lehner
# Released under the terms of the MIT License.

# Program version, printed by --version.
VERSION=1.0.0
# Prefix to which every request path/query is appended.
BASE_URL='https://groups.google.com/forum/'

# Exit status used for command-line usage errors.
EX_USAGE=64
readonly VERSION BASE_URL EX_USAGE

# Number of topics requested per listing page.
topic_range=100

# Settings filled in by command-line parsing further down.
opt_verbose=    # non-empty enables debug output on stderr
group_id=       # group to operate on
dest_dir=       # directory downloaded .eml files are written to
ln_dir=         # optional directory to hard-link downloaded files into

# Cookie string handed to curl; taken from $GG_COOKIE when that is set and
# non-empty, otherwise empty.
cookie_str=${GG_COOKIE:-}
25 | |
# Print a diagnostic message on stderr when verbose mode is enabled.
# Globals:   opt_verbose (read) - output is produced only when non-empty
# Arguments: the words of the message; they are joined with single spaces
# Outputs:   the message on stderr; never writes to stdout
debug_print() {
    if [[ -n $opt_verbose ]]; then
        # "$*" keeps every argument intact: no word splitting, no globbing,
        # and a leading "-n"/"-e" is printed rather than eaten by echo.
        printf '%s\n' "$*" >&2
    fi
}
31 | |
# Perform an HTTP request against the forum base URL with curl.
# Globals:   BASE_URL (read), cookie_str (read)
# Arguments: $1  - path/query appended to BASE_URL
#            $2… - extra arguments handed to curl unchanged (e.g. -I, -o FILE)
# Outputs:   whatever curl writes to stdout
# Returns:   curl's exit status
req() {
    local path=${1-}
    if (( $# > 0 )); then
        shift
    fi

    debug_print req: "$path"
    # Only the first argument is part of the URL; the rest are curl options.
    curl -sN -b "$cookie_str" "${BASE_URL}${path}" "$@"
}
36 | |
# Request a page through the "_escaped_fragment_" query parameter.
# Arguments: $1  - fragment value placed after "?_escaped_fragment_="
#            $2… - extra arguments passed on to req (and from there to curl)
# Outputs/Returns: those of req
req_fragment() {
    local fragment=${1-}
    if (( $# > 0 )); then
        shift
    fi

    req "?_escaped_fragment_=${fragment}" "$@"
}
40 | |
# Check that the group's topic listing can be fetched with the current cookie.
# Globals:   group_id (read)
# Outputs:   'Unable to access group.' on stderr when the check fails
# Returns:   0 when the response status is 200, 1 otherwise
check_permission() {
    debug_print testing permission
    # -I makes curl print only the response headers. The status code is read
    # from the last status line instead of grepping for '200 OK': HTTP/2
    # status lines carry no reason phrase ("HTTP/2 200"), and a plain grep
    # could also match an unrelated header. "+ 0" drops a trailing CR.
    if ! req_fragment "forum/${group_id}%5B1-1-false%5D" -I |
        awk '/^HTTP\// { code = $2 + 0 } END { exit (code == 200 ? 0 : 1) }'
    then
        echo 'Unable to access group.' >&2
        return 1
    fi
}
49 | |
# Fetch one page of the group's topic listing and print its topics.
# Requires GNU sed (\n in replacements, the M flag, Q with an exit code).
# Globals:   group_id (read)
# Arguments: $1 - first topic number of the page
#            $2 - last topic number of the page
# Outputs:   one line per topic with three tab-separated fields. The second
#            sed joins each group of three consecutive lines from the first
#            sed, in the order first, third, second.
#            NOTE(review): the meaning of each field depends on the order of
#            the date and subject cells in the live markup — confirm.
# Returns:   99 when the page reports "of 0 topics"; otherwise the exit
#            status of the final sed.
get_topics_single() {
    local start=$1
    local end=$2

    debug_print "get topics $group_id [$start-$end]"

    # First sed: turn the "of 0 topics" notice into an empty marker line, and
    # pull the lastPostDate text plus the subject link's last path segment
    # and link text out of the listing.
    # Second sed: Q99 (not q99) quits on the marker WITHOUT printing it, so
    # no blank "topic" line reaches the callers.
    req_fragment "forum/${group_id}%5B${start}-${end}-false%5D" | sed -n \
        "s/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate\">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href=\"[^\"]*\/\([^\"]*\)\"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }" | sed '/^$/Q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/'
}
66 | |
# Report an unrecognised command-line word and terminate the script.
# Globals:   EX_USAGE (read) - exit status to terminate with
# Arguments: $1 - the offending word
# Outputs:   "Unknown option <word>" on stderr
unknown_option() {
    printf 'Unknown option %s\n' "$1" >&2
    exit "$EX_USAGE"
}
71 | |
# List the group's topics, fetching them page by page.
# Globals:   topic_range (read) - number of topics per page
#            group_id (read)
# Arguments: $1 - first topic number; empty or 0 means 1
#            $2 - last topic number; empty or 0 means "until a page fails"
# Outputs:   whatever get_topics_single prints for each page
# Returns:   1 when check_permission fails
get_topics() {
    local start=$(($1))
    local end=$(($2))
    # The loop counter is local so it cannot clobber a caller's variable.
    local i chunk_end

    if (( start == 0 )); then
        start=1
    fi

    check_permission || return 1

    debug_print "get all topics $group_id [$start-$end]"

    for (( i = start; end == 0 || i <= end; i += topic_range )); do
        # Clamp the last page to the requested end, if one was given.
        chunk_end=$(( i + topic_range - 1 ))
        if (( end != 0 && chunk_end > end )); then
            chunk_end=$end
        fi
        # A failing page (e.g. status 99: no topics left) ends the listing.
        get_topics_single "$i" "$chunk_end" || break
    done
}
90 | |
# Get the message ids in a topic.
# Globals:   group_id (read)
# Arguments: $1 - topic id
# Outputs:   one message id per line: for each line containing a
#            <td class="subject"> link, the part of its href after
#            "<topic_id>/"
get_messages() {
    local topic_id="$1"
    # The topic id is spliced into the sed script inside double quotes so it
    # is never word-split or glob-expanded by the shell.
    req_fragment "topic/${group_id}/${topic_id}" | sed -n \
        's/.*<td class="subject"><a href="[^"]*'"$topic_id"'\/\([^"]*\)".*/\1/p'
}
97 | |
# Download one raw message into dest_dir unless it is already there.
# Globals:   dest_dir (read), group_id (read), ln_dir (read)
# Arguments: $1 - topic id
#            $2 - message id
# Outputs:   a progress line on stdout; a failure notice on stderr
# Returns:   0 when the message is present afterwards (already downloaded or
#            fetched now), 1 when the temp file, download or move failed
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    # Local, so repeated calls do not leak or clobber global variables.
    local path temp

    debug_print download topic "$topic_id" message "$msg_id"
    path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"

    # A non-empty file counts as already downloaded.
    if [[ -s "$path" ]]; then
        echo "message ${topic_id}${msg_id} already downloaded. skipping."
        return 0
    fi

    echo "message ${topic_id}${msg_id} downloading."
    temp=$(mktemp) || return 1
    # Download to a temp file first so an interrupted transfer never leaves a
    # partial .eml behind. -f makes curl fail on HTTP error responses instead
    # of saving the error page as if it were the message.
    if req "message/raw?msg=${group_id}/${topic_id}/${msg_id}" -f -o "$temp"
    then
        if ! mv -- "$temp" "$path"; then
            rm -f -- "$temp"
            return 1
        fi
        if [[ -n "$ln_dir" ]]; then
            ln -- "$path" "$ln_dir"
        fi
    else
        echo "message ${topic_id}${msg_id} failed to download." >&2
        rm -f -- "$temp"
        return 1
    fi
}
120 | |
# Download every message of one topic.
# Arguments: $1 - topic id
# Outputs:   the progress lines printed by download_message
download_messages() {
    local topic_id="$1"
    local msg_id

    debug_print download topic "$topic_id"
    # read -r keeps backslashes in ids intact; only the first word of each
    # line is used as the message id.
    get_messages "$topic_id" | while read -r msg_id _; do
        [[ -n $msg_id ]] || continue
        download_message "$topic_id" "$msg_id"
    done
}
128 | |
# Download the messages of every topic in the given topic range.
# Arguments: $1 - first topic number (passed to get_topics)
#            $2 - last topic number (passed to get_topics)
# Outputs:   the progress lines printed while downloading
download_mails() {
    local start=$1
    local end=$2
    local topic_id topic_title

    # The first whitespace-separated field of each listing line is used as
    # the topic id; the rest of the line is ignored.
    get_topics "$start" "$end" | while read -r topic_id topic_title; do
        # Skip blank lines so no download is attempted without a topic id.
        [[ -n $topic_id ]] || continue
        download_messages "$topic_id"
    done
}
137 | |
# Print the program name and version on stdout.
# Globals: VERSION (read)
show_version() {
    printf 'ggscrape %s\n' "$VERSION"
}
141 | |
# Print the usage text on stdout.
# The heredoc delimiter is quoted so the text is emitted literally, and the
# option descriptions are aligned in columns for readability.
show_help() {
    cat <<'EOF'
ggscrape. Download emails from a Google Group

Usage:
    ggscrape <group_id> test
    ggscrape <group_id> topics
    ggscrape <group_id> messages <topic_id>
    ggscrape <group_id> download <dest_dir>

Options:
    -h, --help              Show this screen
    --version               Show version
    -v, --verbose           Show debug info
    -c, --cookie <cookie>   Use the given cookie string
    -b, --begin <topicnum>  Topic number at which to begin downloading
    -e, --end <topicnum>    Topic number at which to stop downloading
    -l, --ln <ln_dir>       Hard link email files into this directory

Environmental variables:

    GG_COOKIE               use as value for --cookie

EOF
}
167 | |
# Results of command-line parsing.
cmd=            # selected command: test, topics, messages or download
topic_id=       # topic id given to the "messages" command
topic_begin=    # value of --begin
topic_end=      # value of --end

# Abort with a usage error unless the option in $1 is followed by a value.
# Arguments: the remaining command-line words, starting with the option
_require_value() {
    if (( $# < 2 )); then
        echo "Option $1 requires an argument" >&2
        exit "$EX_USAGE"
    fi
}

# Parse the command line into the global settings.
# Globals:   opt_verbose, cookie_str, topic_begin, topic_end, ln_dir, cmd,
#            topic_id, dest_dir, group_id (written)
# Arguments: the script's command-line arguments
# Exits:     after printing help/version for -h/--help/--version; with
#            EX_USAGE when a value option lacks its value; via
#            unknown_option for unrecognised words
parse_args() {
    local consumed
    while (( $# > 0 )); do
        consumed=1    # how many words this iteration uses up
        case "$1" in
            -h|--help) show_help; exit;;
            --version) show_version; exit;;
            -v|--verbose) opt_verbose=1;;
            -c|--cookie) _require_value "$@"; cookie_str=$2; consumed=2;;
            -b|--begin) _require_value "$@"; topic_begin=$2; consumed=2;;
            -e|--end) _require_value "$@"; topic_end=$2; consumed=2;;
            -l|--ln) _require_value "$@"; ln_dir=$2; consumed=2;;
            topics) cmd=topics;;
            test) cmd=test;;
            messages) cmd=messages; topic_id=${2-}; consumed=2;;
            download) cmd=download; dest_dir=${2-}; consumed=2;;
            # An unrecognised option must not be mistaken for the group id.
            -*) unknown_option "$1";;
            *)
                # The first free-standing word is the group id; any further
                # one is an error.
                if [[ -z "$group_id" ]]; then
                    group_id=$1
                else
                    unknown_option "$1"
                fi;;
        esac
        # A command given as the last word has no operand to skip over.
        if (( consumed > $# )); then
            consumed=$#
        fi
        shift "$consumed"
    done
}

parse_args "$@"
194 | |
# Every command needs a group id.
if [[ -z "$group_id" ]]; then
    show_help
    exit 1
fi

# Run the selected command.
case "$cmd" in
    '') show_help; exit;;
    test) check_permission && echo Success;;
    topics) get_topics "$topic_begin" "$topic_end";;
    download)
        # With an empty dest_dir the files would be written to "/".
        if [[ -z "$dest_dir" ]]; then
            echo 'download requires a destination directory' >&2
            exit "$EX_USAGE"
        fi
        download_mails "$topic_begin" "$topic_end";;
    messages)
        if [[ -z "$topic_id" ]]; then
            echo 'messages requires a topic id' >&2
            exit "$EX_USAGE"
        fi
        get_messages "$topic_id";;
    *)
        echo "Unknown command $cmd" >&2
        exit "$EX_USAGE";;
esac
208 |
Built with git-ssb-web