2008年8月1日金曜日

(Code) readnew

#!/bin/sh
#
# Follows one dat file over HTTP: downloads it once, then polls its
# Last-Modified header and hands every newly appended line to parsedat
# until the file holds 1000 lines.
#
# The first command-line argument is passed to url2dat; the output of
# url2dat is used as the URL of the dat file.
# Required external commands: url2dat, mktemp, curl, parsedat.

## parameter
DLT=2        # seconds slept between polls
TIMEOUT=10   # time limit handed to every curl call through -m

## program name
# Quoted so that a script path containing whitespace still yields one name.
PROG=$(basename "$0")

## environment
# Exported so the external commands started by this script inherit it even
# when LANG was not already exported by the caller.
LANG=ja_JP.UTF-8
export LANG

## check command
# Stop early when one of the external commands used below is missing.
# `command -v` is the POSIX lookup; `which` is not guaranteed to exist and
# some implementations print a "not found" message on stdout, which would
# make an emptiness test on its output pass for a missing command.
LIST="url2dat mktemp curl parsedat"
for CMD in $LIST; do   # unquoted on purpose: LIST is a space-separated list
  if ! command -v "$CMD" >/dev/null 2>&1; then
    echo "Error: $PROG: need command \"$CMD\"" >&2
    exit 1
  fi
done

## init
# Resolve the dat URL from the first argument.
# ${1+"$1"} hands the argument over as exactly one word (no word splitting,
# no globbing on characters such as '?') and passes nothing at all when the
# script was started without an argument.
URL=$(url2dat ${1+"$1"})
TEST_URL2DAT=$?
if [ "$TEST_URL2DAT" -ne 0 ]; then
  echo "Error: $PROG: url2dat ($TEST_URL2DAT)" >&2
  exit 1
fi
echo "read new res from $URL"

# Working directory for the latest response header and the local dat copy.
# Without this check a failed mktemp would leave DIR empty and the files
# below would be written to /head.txt and /dat.txt.
DIR=$(mktemp -d)
TEST_MKTEMP=$?
if [ "$TEST_MKTEMP" -ne 0 ] || [ -z "$DIR" ]; then
  echo "Error: $PROG: mktemp ($TEST_MKTEMP)" >&2
  exit 1
fi
echo "tmp dir: $DIR"
FILE_HEAD=$DIR/head.txt
FILE=$DIR/dat.txt

# Fetch only the response header (-I) to get the baseline modification time.
curl -Ss -m "$TIMEOUT" -I "$URL" -o "$FILE_HEAD"
TEST_GETHEAD=$?
if [ "$TEST_GETHEAD" -ne 0 ]; then
  echo "Error: $PROG: fail getting initial header ($TEST_GETHEAD)" >&2
  exit 1
fi
# Header names are case-insensitive, so match with -i; header lines end in
# CR LF, so the CR is removed before the value is handed to date.
DATE=$(grep -i '^Last-Modified: ' "$FILE_HEAD" | sed 's/^[^:]*: *//' | tr -d '\r')
TIME_MOD=$(date -d "$DATE" +%s)   # seconds since the epoch

# First full download of the dat file.
curl -Ss -m "$TIMEOUT" --compressed "$URL" -o "$FILE"
TEST_INITGET=$?
if [ "$TEST_INITGET" -ne 0 ]; then
  echo "Error: $PROG: fail getting initial dat file ($TEST_INITGET)" >&2
  exit 1
fi
# N = number of lines already present; the polling loop only reports lines
# after this one.
N=$(wc -l <"$FILE" | awk '{print $1}')
echo "got initial dat file ($N)"
echo

## body
# Poll the dat file until it holds 1000 lines.
# Each round: fetch the header, and when Last-Modified is newer than the
# last accepted one, download the dat file and pass the new lines to
# parsedat. ERROR_GET counts failed requests; it is reset only after a
# successful dat download, and the script stops when it reaches 5.
ERROR_GET=0
FLG_NORANGE=0
while [ "$N" -lt 1000 ]; do
  curl -Ss -m "$TIMEOUT" -I "$URL" -o "$FILE_HEAD"
  TEST_GETHEAD=$?
  if [ "$TEST_GETHEAD" -ne 0 ]; then
    ## if fail getting header
    ERROR_GET=$((ERROR_GET + 1))
    echo "Error: $PROG: fail getting dat's header ($TEST_GETHEAD)" >&2
    echo " $(date) ($ERROR_GET)" >&2
    if [ "$ERROR_GET" -ge 5 ]; then
      echo "Error: $PROG: fail too much (5), stop" >&2
      exit 1
    fi
  else
    ## if success getting header
    # Parse the header only after the request succeeded. Header names are
    # case-insensitive (-i) and the line ends in CR LF (CR removed).
    DATE=$(grep -i '^Last-Modified: ' "$FILE_HEAD" | sed 's/^[^:]*: *//' | tr -d '\r')
    TIME_MOD_NEW=$(date -d "$DATE" +%s)
    if [ "$TIME_MOD_NEW" -gt "$TIME_MOD" ]; then
      ## if dat was renewed
      echo "debug: renewed" >&2 ##d
      if [ "$FLG_NORANGE" -eq 0 ]; then
        ## use range
        echo "debug: range" >&2 ##d
        # -C - lets curl work out the resume offset from the existing
        # output file, so only the appended part is requested.
        curl -s -m "$TIMEOUT" -C - "$URL" -o "$FILE" ## try range get
        TEST_GET=$?
      else
        ## use no range
        echo "debug: no range" >&2 ##d
        curl -s -m "$TIMEOUT" "$URL" -o "$FILE"
        TEST_GET=$?
      fi
      if [ "$TEST_GET" -eq 33 ]; then
        ## if range fail (curl exit code 33), fall back to a full download
        echo "debug: range fail" >&2 ##d
        #FLG_NORANGE=1
        FLG_NORANGE=0 ##d
        curl -s -m "$TIMEOUT" "$URL" -o "$FILE"
        TEST_GET=$?
      fi

      if [ "$TEST_GET" -eq 18 ]; then
        ## curl exit code 18: treated as "nothing new", neither counted as
        ## an error nor as a successful download
        echo "debug: success getting dat by range but no modified" >&2 ##d
      elif [ "$TEST_GET" -eq 0 ]; then
        ## if success getting dat
        echo "debug: success getting dat" >&2 ##d
        N_NEW=$(wc -l <"$FILE" | awk '{print $1}')
        if [ "$N_NEW" -gt "$N" ]; then
          echo "debug: new res found" >&2 ##d
          # Cut out lines N+1..N_NEW and hand them to parsedat together
          # with the number of lines that were already known.
          sed -n "$((N + 1)),${N_NEW}p" "$FILE" >"$FILE.tmp"
          parsedat "$FILE.tmp" "$N"

          N=$N_NEW
          TIME_MOD=$TIME_MOD_NEW
        fi
        ERROR_GET=0
      else
        ## if fail getting dat
        echo "debug: fail getting dat" >&2 ##d
        ERROR_GET=$((ERROR_GET + 1))
        echo "Error: $PROG: fail getting dat file ($TEST_GET)" >&2
        echo " $(date) ($ERROR_GET)" >&2
        if [ "$ERROR_GET" -ge 5 ]; then
          echo "Error: $PROG: fail too much (5), stop" >&2
          exit 1
        fi
      fi
    fi
  fi
  # Pause after every round, including failed header requests, so that
  # retries are spaced out instead of being issued back-to-back.
  sleep "$DLT"
done
echo "reached 1000, stop"