git ssb

0+

cel / ggscrape



Tree: 8587a7ef0afc0ac2f7e6bf18c7f56c9b2d4eeec6

Files: 8587a7ef0afc0ac2f7e6bf18c7f56c9b2d4eeec6 / ggscrape

4798 bytes Raw
1#!/bin/bash
2# vi: ts=4 sw=4 et
3#
4# ggscrape(1)
5# Download emails from a Google Group
6#
7# Copyright (c) 2014 Charles Lehner
8# Released under the terms of the MIT License.
9
# Version string printed by show_version.
VERSION=0.0.1
# Prefix that req prepends to every request path.
BASE_URL='https://groups.google.com/forum/'

# Exit status used for command-line usage errors.
EX_USAGE=64
# Step, in topics, between successive listing requests in get_topics.
topic_range=100

# Settings filled in later by the command-line parser; all start empty.
opt_verbose=
group_id=
dest_dir=
ln_dir=

# Cookie string: taken from $GG_COOKIE when that is set and non-empty,
# otherwise empty.
cookie_str=${GG_COOKIE:-}
25
# debug_print <word>...
# When verbose mode is on (opt_verbose is non-empty), write the arguments,
# joined by single spaces, as one line on stderr. Otherwise do nothing.
debug_print() {
    if [[ -n $opt_verbose ]]; then
        # "$*" keeps every argument intact: no re-splitting and no pathname
        # expansion (a message such as "[1-100]" is a valid glob pattern).
        # printf also cannot mistake a leading "-n"/"-e" for an option.
        printf '%s\n' "$*" >&2
    fi
}
31
# req <path> [curl-option]...
# Fetch BASE_URL followed by <path> with curl, sending cookie_str as the
# cookie. Any further arguments are handed to curl unchanged (callers use
# this for -I and -o <file>). The exit status is curl's.
req() {
    local rel_url=$1
    debug_print req: "$rel_url"
    # Drop the path so that "$@" holds only the extra curl options.
    shift
    curl -sN -b "$cookie_str" "${BASE_URL}${rel_url}" "$@"
}
36
# req_fragment <fragment> [curl-option]...
# Request the "?_escaped_fragment_=<fragment>" URL through req, forwarding
# any additional arguments to it untouched.
req_fragment() {
    local fragment=$1
    shift
    req "?_escaped_fragment_=${fragment}" "$@"
}
40
# test_permission
# Succeed (status 0) when a headers-only (-I) request for the 1-1 topic
# listing of $group_id is answered with HTTP status 200; fail otherwise.
test_permission() {
    debug_print testing permission
    # Match the status line itself rather than the text "200 OK": HTTP/2
    # responses carry no reason phrase ("HTTP/2 200"), and the bare text
    # could also appear inside some other header.
    req_fragment "forum/${group_id}%5B1-1-false%5D" -I \
        | grep -Eq '^HTTP/[0-9.]+ 200([^0-9]|$)'
}
45
# get_topics_single <start> <end>
# Request one page of the topic listing of $group_id (topics <start>..<end>)
# and print one line per topic: "<id><TAB><last post date><TAB><title>".
# Returns 99, printing nothing, when the page reports "of 0 topics".
# Relies on GNU sed extensions (\n and \t in replacements, the M flag, Q).
get_topics_single() {
    local start=$1
    local end=$2

    debug_print "get topics $group_id [$start-$end]"

    # First sed: emit an empty line for the "0 topics" marker; otherwise emit,
    # per topic, the id and title taken from the subject link and the text of
    # the lastPostDate cell, each on its own line.
    # Second sed: an empty first line means "no topics" -> quit with status 99.
    # Q (not q) is used so the empty line itself is not printed to callers.
    # Otherwise join each group of three lines as line1 TAB line3 TAB line2.
    req_fragment "forum/${group_id}%5B${start}-${end}-false%5D" | sed -n \
        "s/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate\">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href=\"[^\"]*\/\([^\"]*\)\"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }" | sed '/^$/Q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/'
}
62
# unknown_option <arg>
# Report <arg> as an unknown option on stderr and terminate the script with
# the usage-error status EX_USAGE.
unknown_option() {
    local bad_opt=$1
    printf 'Unknown option %s\n' "$bad_opt" >&2
    exit $EX_USAGE
}
67
# get_topics <start> <end>
# Print the topic listing for topics <start>..<end> by calling
# get_topics_single once per chunk of topic_range topics.
# Both arguments are evaluated arithmetically, so an empty value counts as 0.
# A start of 0 means 1; an end of 0 means "no upper bound". In every case the
# loop stops as soon as get_topics_single returns non-zero.
get_topics() {
    local start=$(($1))
    local end=$(($2))
    # Keep the loop counter local so it cannot clobber a caller's "i".
    local i temp_end

    if ((start == 0)); then
        start=1
    fi

    debug_print "get all topics $group_id [$start-$end]"

    for ((i = start; i <= end || end == 0; i += topic_range)); do
        # Clamp the last chunk to <end> when an upper bound was given.
        ((temp_end = (end && i + topic_range - 1 > end) ? end : i + topic_range - 1))
        get_topics_single "$i" "$temp_end" || break
    done
}
84
# get_messages <topic_id>
# Print the ids of the messages in a topic, one per line. An id is whatever
# follows "<topic_id>/" in the href of a <td class="subject"> link on the
# topic page of $group_id.
get_messages() {
    local topic_id="$1"
    local topic_re=$topic_id
    local ch

    # The topic id is spliced into a sed regex, so escape the characters that
    # are special there (backslash first, so added escapes are not re-escaped)
    # and the "/" delimiter; the id is then matched literally.
    for ch in '\' '.' '*' '[' '^' '$' '/'; do
        topic_re=${topic_re//"$ch"/"\\$ch"}
    done

    # The expansion is quoted so an id containing whitespace cannot split the
    # sed program into several arguments.
    req_fragment "topic/${group_id}/${topic_id}" | sed -n \
        's/.*<td class="subject"><a href="[^"]*'"$topic_re"'\/\([^"]*\)".*/\1/p'
}
91
# download_message <topic_id> <msg_id>
# Download the raw message into
#   ${dest_dir}/${group_id}${topic_id}${msg_id}.eml
# unless a non-empty file already exists there. When ln_dir is set, the new
# file is also hard-linked into it. Progress goes to stdout, failures to
# stderr.
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    # Local, so these do not leak into (or clobber) global variables.
    local path temp

    debug_print download topic "$topic_id" message "$msg_id"
    path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"
    if [[ -s "$path" ]]; then
        echo "message ${topic_id}${msg_id} already downloaded. skipping."
        return
    fi

    echo "message ${topic_id}${msg_id} downloading."
    temp=$(mktemp) || {
        echo "message ${topic_id}${msg_id} failed to download." >&2
        return 1
    }
    # -f makes curl fail on HTTP errors instead of saving the error page as
    # the message, which the -s check above would then skip forever.
    # The message only counts as downloaded once it is in place, so a failed
    # mv is handled like a failed request and the temp file is removed.
    if req "message/raw?msg=${group_id}/${topic_id}/${msg_id}" -f -o "$temp" \
        && mv -- "$temp" "$path"
    then
        if [[ -n "$ln_dir" ]]; then
            ln -- "$path" "$ln_dir"
        fi
    else
        echo "message ${topic_id}${msg_id} failed to download." >&2
        rm -f -- "$temp"
    fi
}
114
# download_messages <topic_id>
# Download every message of a topic: list the message ids with get_messages
# and call download_message for each one.
download_messages() {
    local topic_id="$1"
    local msg_id

    debug_print download topic "$topic_id"
    # -r keeps backslashes literal; only the first field of a line is the id.
    get_messages "$topic_id" | while read -r msg_id _; do
        # Skip blank lines rather than requesting a message with no id.
        [[ -n $msg_id ]] || continue
        download_message "$topic_id" "$msg_id"
    done
}
122
# download_mails <start> <end>
# Download the messages of every topic listed by get_topics <start> <end>.
download_mails() {
    local start=$1
    local end=$2
    local topic_id

    # Only the first field (the topic id) of each listing line is needed.
    get_topics "$start" "$end" | while read -r topic_id _; do
        # A blank listing line carries no topic; do not call
        # download_messages without an id.
        [[ -n $topic_id ]] || continue
        download_messages "$topic_id"
    done
}
131
# show_version
# Print "ggscrape <VERSION>" on stdout.
show_version() {
    # Quoted, so the version string is printed verbatim (no word splitting or
    # pathname expansion).
    printf 'ggscrape %s\n' "$VERSION"
}
135
# show_help
# Print the usage text (commands, options, environment) on stdout.
show_help() {
    # One argument per output line; printf repeats the format for each.
    printf '%s\n' \
        'ggscrape. Download emails from a Google Group' \
        '' \
        'Usage:' \
        ' ggscrape <group_id> [test]' \
        ' ggscrape <group_id> topics' \
        ' ggscrape <group_id> messages <topic_id>' \
        ' ggscrape <group_id> download <dest_dir>' \
        '' \
        'Options:' \
        ' -h, --help Show this screen' \
        ' --version Show version' \
        ' -v, --verbose Show debug info' \
        ' -c, --cookie <cookie> Use the given cookie string' \
        ' -b, --begin <topicnum> Topic number at which to begin downloading' \
        ' -e, --end <topicnum> Topic number at which to stop downloading' \
        ' -l, --ln <ln_dir> Hard link email files into this directory' \
        '' \
        'Environmental variables:' \
        '' \
        ' GG_COOKIE use as value for --cookie' \
        ''
}
161
# Command selected on the command line, and its operands.
cmd=
topic_id=
topic_begin=
topic_end=

# require_value <name> <argc>
# Exit with a usage error unless option/command <name> is followed by a
# value, i.e. unless at least two positional parameters remain.
require_value() {
    if (( $2 < 2 )); then
        echo "Missing argument for $1" >&2
        exit "$EX_USAGE"
    fi
}

# Walk the arguments. Options and command words may appear in any order; the
# first word that is neither becomes the group id, and any later one is
# rejected.
while [[ "$#" -gt 0 ]]; do
    case "$1" in
        -h|--help) show_help; exit;;
        --version) show_version; exit;;
        -v|--verbose) opt_verbose=1;;
        # Each arm below consumes the following word as its value; the extra
        # shift skips that value.
        -c|--cookie) require_value "$1" "$#"; cookie_str="$2"; shift;;
        -b|--begin) require_value "$1" "$#"; topic_begin="$2"; shift;;
        -e|--end) require_value "$1" "$#"; topic_end="$2"; shift;;
        -l|--ln) require_value "$1" "$#"; ln_dir="$2"; shift;;
        topics) cmd=topics;;
        test) cmd=test;;
        messages) require_value "$1" "$#"; cmd=messages; topic_id="$2"; shift;;
        download) require_value "$1" "$#"; cmd=download; dest_dir="$2"; shift;;
        # An unrecognised dash-option must not be mistaken for the group id.
        -*) unknown_option "$1";;
        *)
            if [[ -z "$group_id" ]]; then
                group_id="$1"
            else
                unknown_option "$1"
            fi
            ;;
    esac
    shift
done
188
# A group id is mandatory for every command.
if [[ -z "$group_id" ]]; then
    show_help
    exit 1
fi

# Run the selected command; with no command, just print the help text.
case "$cmd" in
    '') show_help; exit;;
    test) test_permission;;
    topics) get_topics "$topic_begin" "$topic_end";;
    download)
        # With an empty dest_dir the message path would start at "/", and a
        # missing directory would fail once per message after each download.
        if [[ -z "$dest_dir" ]]; then
            echo "download requires a destination directory" >&2
            exit "$EX_USAGE"
        fi
        if [[ ! -d "$dest_dir" ]]; then
            echo "Not a directory: $dest_dir" >&2
            exit 1
        fi
        download_mails "$topic_begin" "$topic_end"
        ;;
    messages)
        if [[ -z "$topic_id" ]]; then
            echo "messages requires a topic id" >&2
            exit "$EX_USAGE"
        fi
        get_messages "$topic_id"
        ;;
    # Report an unknown command as a usage error instead of exiting 0.
    *) echo "Unknown command $cmd" >&2; exit "$EX_USAGE";;
esac
202

Built with git-ssb-web