git ssb

0+

cel / ggscrape



Tree: 1f4de2eeab8c60b37f065c88bd87aa3ba6c51e0d

Files: 1f4de2eeab8c60b37f065c88bd87aa3ba6c51e0d / ggscrape

4273 bytesRaw
#!/bin/bash
# vi: ts=4 sw=4 et
#
# ggscrape(1)
# Download emails from a Google Group
#
# Copyright (c) 2014 Charles Lehner
# Released under the terms of the MIT License.

# Version string printed by --version.
readonly VERSION=0.0.1
# Prefix that every request URL is appended to.
readonly BASE_URL='https://groups.google.com/forum/?_escaped_fragment_='

# Exit status used for command-line usage errors.
readonly EX_USAGE=64
# Number of topics requested per listing page.
topic_range=100

# Settings filled in later by the command-line parser.
opt_verbose=
group_id=
dest_dir=
# The GG_COOKIE environment variable supplies the default cookie string;
# -c/--cookie may override it. Empty when GG_COOKIE is unset or empty.
cookie_str="${GG_COOKIE:-}"

# Print a debug message to stderr when verbose mode is enabled.
# Globals:   opt_verbose (read) - any non-empty value enables output
# Arguments: the words of the message; joined with the first IFS character
#            (a space by default)
# Outputs:   the message on stderr; never writes to stdout
debug_print() {
    if [[ -n "${opt_verbose:-}" ]]; then
        # "$*" keeps the message verbatim: no re-splitting on whitespace and
        # no glob expansion of text such as "[1-100]" or "*".
        printf '%s\n' "$*" >&2
    fi
}

# Fetch a resource below BASE_URL with curl.
# Globals:   BASE_URL (read), cookie_str (read)
# Arguments: $1   - path appended to BASE_URL
#            $2.. - extra arguments handed to curl unchanged (e.g. -I)
# Outputs:   curl's output on stdout
# Returns:   curl's exit status
req() {
    local path=${1-}
    if (( $# > 0 )); then
        shift
    fi

    debug_print query: "$path"
    # -s: silent, -N: no output buffering, -b: cookie string to send.
    # Only the path is glued to BASE_URL; the remaining arguments follow it
    # as separate words.
    curl -sN -b "$cookie_str" "${BASE_URL}${path}" "$@"
}

# Check whether the group's topic listing can be fetched with the current
# cookie.
# Globals:   group_id (read)
# Returns:   0 if the response status line reports status 200, non-zero
#            otherwise
test_permission() {
    debug_print testing permission
    # -I makes curl fetch the headers only. Match the status code on the
    # status line itself: HTTP/2 status lines carry no "OK" reason phrase,
    # and a bare '200 OK' search could also match an unrelated header value.
    req "forum/${group_id}%5B1-1-false%5D" -I \
        | grep -Eq '^HTTP/[0-9.]+ 200([^0-9]|$)'
}

# Fetch one page of the topic listing and print one tab-separated record per
# topic.
# Globals:   group_id (read)
# Arguments: $1 - first topic number of the page
#            $2 - last topic number of the page
# Outputs:   the first sed emits extracted lines (topic id and title from the
#            "subject" link, text of the "lastPostDate" element); the second
#            sed joins each group of three lines with tabs in the order
#            first, third, second.
#            NOTE(review): with the subject link preceding the date this gives
#            "id<TAB>date<TAB>title" - depends on the page markup, confirm.
# Returns:   0 if at least one record was printed; 99 if the page reports
#            "0 topics" or nothing at all could be parsed (empty or failed
#            response), so callers that page until failure always terminate.
get_topics_single() {
    local start=$1
    local end=$2
    local rows status

    debug_print "get topics $group_id [$start-$end]"

    # The second sed quits with status 99 (GNU sed extension) on an empty
    # line, which the first sed prints for the "of 0 topics" notice.
    rows=$(req "forum/${group_id}%5B${start}-${end}-false%5D" | sed -n \
        "s/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate\">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href=\"[^\"]*\/\([^\"]*\)\"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }" | sed '/^$/q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/')
    status=$?

    # Command substitution drops the trailing blank marker line, so only
    # real records are passed on.
    if [[ -n "$rows" ]]; then
        printf '%s\n' "$rows"
    fi
    if (( status != 0 )); then
        return "$status"
    fi
    # No records and no "0 topics" marker: report it as "no matches" too.
    [[ -n "$rows" ]] || return 99
}

# Report an unrecognised command-line argument and terminate the script.
# Globals:   EX_USAGE (read) - exit status; falls back to 64 if unset
# Arguments: $1 - the offending argument
# Outputs:   error message on stderr
unknown_option() {
    printf 'Unknown option %s\n' "$1" >&2
    exit "${EX_USAGE:-64}"
}

# List topics page by page, topic_range topics per request.
# Globals:   topic_range (read), group_id (read), EX_USAGE (read)
# Arguments: $1 - first topic number; defaults to 1 when empty or missing
#            $2 - last topic number; empty, missing or 0 means "no limit":
#                 keep paging until a page yields nothing
# Outputs:   whatever get_topics_single prints for each page
# Returns:   EX_USAGE (64) if an argument is not a non-negative integer,
#            otherwise 0
get_topics() {
    # Apply the defaults before any arithmetic: evaluating an empty string
    # arithmetically yields 0, which would hide the "start at 1" default.
    local start=${1:-1}
    local end=${2:-0}
    local step=${topic_range:-100}
    local i temp_end

    if [[ ! "$start" =~ ^[0-9]+$ || ! "$end" =~ ^[0-9]+$ ]]; then
        printf 'Invalid topic range: %s-%s\n' "$start" "$end" >&2
        return "${EX_USAGE:-64}"
    fi
    # Force base 10 so that leading zeros are not read as octal.
    start=$((10#$start))
    end=$((10#$end))
    # A non-positive or non-numeric step would never advance the loop.
    if [[ ! "$step" =~ ^[0-9]+$ ]] || (( 10#$step < 1 )); then
        step=100
    fi
    step=$((10#$step))

    debug_print "get all topics $group_id [$start-$end]"

    for ((i = start; i <= end || end == 0; i += step)); do
        # Clamp the last page to the requested end, if there is one.
        ((temp_end = (end && i+step-1 > end) ? end : i+step-1))
        get_topics_single "$i" "$temp_end" || break
    done
}

# Get message ids in a topic.
# Globals:   group_id (read)
# Arguments: $1 - topic id
# Outputs:   for every line of the topic page holding a "subject" cell whose
#            link contains "<topic id>/", the part of the link after that
#            slash (up to the closing quote), one per line
get_messages() {
    local topic_id="$1"
    local topic_re

    # The id becomes part of a sed pattern: escape the BRE metacharacters and
    # the "/" delimiter so it can only match literally.
    topic_re=$(printf '%s' "$topic_id" | sed 's/[][\.*^$/]/\\&/g')

    req "topic/${group_id}/${topic_id}" | sed -n \
        's/.*<td class="subject"><a href="[^"]*'"$topic_re"'\/\([^"]*\)".*/\1/p'
}

# Download a single message into dest_dir unless it is already there.
# Globals:   dest_dir (read), group_id (read)
# Arguments: $1 - topic id
#            $2 - message id
# Outputs:   a notice on stdout when the message file already exists;
#            the message itself goes to
#            "<dest_dir>/<group_id><topic_id><msg_id>.eml"
# Returns:   0 on success or skip, otherwise the status of the failed step
# NOTE(review): the request path has no "forum/" or "topic/" style prefix,
# unlike the other requests in this script - confirm this is intended.
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    local path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"
    local tmp="${path}.part"
    local status=0

    debug_print download topic "$topic_id" message "$msg_id"

    # An existing non-empty file counts as already downloaded.
    if [[ -s "$path" ]]; then
        echo "file for message ${msg_id} already exists. skipping."
        return 0
    fi

    # Write to a scratch file first so that an interrupted transfer never
    # leaves a partial file that a later run would mistake for a finished one.
    req "${group_id}/${topic_id}/${msg_id}" > "$tmp" || status=$?
    if (( status != 0 )); then
        rm -f -- "$tmp"
        return "$status"
    fi
    mv -f -- "$tmp" "$path"
}

# Download every message of one topic.
# Arguments: $1 - topic id
# Outputs:   whatever download_message prints for each message
download_messages() {
    local topic_id="$1"
    local msg_id

    debug_print download topic "$topic_id"

    # One id per line; only the first word of a line is used. -r keeps
    # backslashes intact and blank lines are skipped.
    get_messages "$topic_id" | while read -r msg_id _; do
        [[ -n "$msg_id" ]] || continue
        # Detach stdin so nothing in the download can consume the id list.
        download_message "$topic_id" "$msg_id" </dev/null
    done
}

# Download the messages of every topic in a topic-number range.
# Globals:   dest_dir (read) - target directory; created if missing
#            EX_USAGE (read)
# Arguments: $1 - first topic number (passed to get_topics)
#            $2 - last topic number (passed to get_topics)
# Returns:   EX_USAGE (64) when no target directory is set, the mkdir status
#            if the directory cannot be created, otherwise the status of the
#            download loop
download_mails() {
    local start=$1
    local end=$2
    local topic_id

    # With an empty dest_dir the message paths would start with "/" and land
    # in the filesystem root.
    if [[ -z "$dest_dir" ]]; then
        echo "No download directory given" >&2
        return "${EX_USAGE:-64}"
    fi
    mkdir -p -- "$dest_dir" || return

    # The first field of each record is the topic id; the rest is ignored.
    get_topics "$start" "$end" | while read -r topic_id _; do
        [[ -n "$topic_id" ]] || continue
        # Detach stdin so nothing in the download can consume the topic list.
        download_messages "$topic_id" </dev/null
    done
}

# Print the program name and version.
# Globals:   VERSION (read)
# Outputs:   "ggscrape <version>" on stdout
show_version() {
    printf 'ggscrape %s\n' "$VERSION"
}

120show_help() {
121 cat << EOF
122ggscrape. Download emails from a Google Group
123
124Usage:
125 ggscrape <group_id> [test]
126 ggscrape <group_id> topics
127 ggscrape <group_id> messages <topic_id>
128 ggscrape <group_id> download <directory>
129
130Options:
131 -h, --help Show this screen
132 --version Show version
133 -v, --verbose Show debug info
134 -c, --cookie <cookie> Use the given cookie string
135 -b, --begin <topicnum> Topic number at which to begin downloading
136 -e, --end <topicnum> Topic number at which to stop downloading
137
138Environmental variables:
139
140 GG_COOKIE use as value for --cookie
141
142EOF
143}
144
# Command-line state filled in by the parser below.
cmd=
topic_id=
topic_begin=
topic_end=

# Abort with a usage error unless the option or command is followed by a value.
# Arguments: $1 - option or command name
#            $2 - number of arguments left, counting $1 itself
require_value() {
    if (( $2 < 2 )); then
        printf 'Missing argument for %s\n' "$1" >&2
        exit "${EX_USAGE:-64}"
    fi
}

# Options and commands may appear in any order. The first word that is
# neither an option nor a command name becomes the group id.
while [[ "$#" -gt 0 ]]; do
    case "$1" in
        -h|--help) show_help; exit;;
        --version) show_version; exit;;
        -v|--verbose) opt_verbose=1;;
        -c|--cookie) require_value "$1" "$#"; cookie_str="$2"; shift;;
        -b|--begin) require_value "$1" "$#"; topic_begin="$2"; shift;;
        -e|--end) require_value "$1" "$#"; topic_end="$2"; shift;;
        topics) cmd=topics;;
        test) cmd=test;;
        messages) require_value "$1" "$#"; cmd=messages; topic_id="$2"; shift;;
        download) require_value "$1" "$#"; cmd=download; dest_dir="$2"; shift;;
        # An unrecognised option must not be mistaken for the group id.
        -*) unknown_option "$1";;
        *) if [[ -z "$group_id" ]]; then
               group_id="$1"
           else
               unknown_option "$1"
           fi;;
    esac
    shift
done

if [[ -z "$group_id" ]]; then
    show_help
    exit 1
fi

# Run the selected command; its status becomes the script's exit status.
# NOTE(review): the usage text lists "test" as optional, yet with no command
# the help is shown instead of running the test - confirm which is intended.
case "$cmd" in
    '') show_help; exit;;
    test) test_permission;;
    topics) get_topics "$topic_begin" "$topic_end";;
    download) download_mails "$topic_begin" "$topic_end";;
    messages) get_messages "$topic_id";;
    *) echo "Unknown command $cmd" >&2; exit "${EX_USAGE:-64}";;
esac


Built with git-ssb-web