Initial garbage

This commit is contained in:
timothy 2021-11-11 16:04:00 +10:30
parent fe6db4ff54
commit da76dd91c4
2 changed files with 86 additions and 0 deletions

43
webtoon-scraper.sh Executable file
View File

@ -0,0 +1,43 @@
#!/bin/bash
# Download every image of one WEBTOON episode into "<toon>/temp/".
#
# Usage: webtoon-scraper.sh <episode-url>
# Dependencies: cut grep wget curl mktemp
# "<toon>/error.log" will list all download errors.

if [[ -z "$1" ]]; then
  printf 'Usage: %s <episode-url>\n' "${0##*/}" >&2
  exit 2
fi

# Scratch files: the downloaded episode page and the lines that hold image tags.
# They are removed by the EXIT trap on every exit path, including early failures.
downloadtemp=$(mktemp "${TMPDIR:-/tmp}/${0##*/}.XXXXXXXXXX") || exit 1
sortingtemp=$(mktemp "${TMPDIR:-/tmp}/${0##*/}.XXXXXXXXXX") || {
  rm -f -- "$downloadtemp"
  exit 1
}
trap 'rm -f -- "$downloadtemp" "$sortingtemp"' EXIT

# Follow the given URL to its final destination.
url=$(curl -s -L -I -o /dev/null -w '%{url_effective}' "$1")

# Pick the pieces out of the resolved URL: 6th "/"-separated field is the toon
# name, 7th is the episode name, 3rd "="-separated field is the episode number.
toonname=$(printf '%s\n' "$url" | cut -d'/' -f6)
episodeno=$(printf '%s\n' "$url" | cut -d'=' -f3)
# Only used by the disabled "convert" step further down.
eipsodename=$(printf '%s\n' "$url" | cut -d'/' -f7)

# Without a toon name every path below would start at "/" (e.g. "/temp").
if [[ -z "$toonname" ]]; then
  printf 'Error: could not work out the toon name from URL: %s\n' "$url" >&2
  exit 1
fi

# Zero-padded episode number, computed once and reused in every file name.
episodepad=$(printf %03d "$episodeno")
pictureno=0

# "temp/logo" must exist before the trailing image is moved into it.
mkdir -p "$toonname/temp/logo" || exit 1

if ! wget --quiet -O "$downloadtemp" "$url"; then
  printf 'Error:\tURL: %s\t(404? This may not be an issue, check the output.)\n' "$url" >> "$toonname/error.log"
  exit 1
fi

# Keep only the lines that carry the episode's image tags.
grep 'rel="nofollow" ondragstart="return false;" onselectstart="return false;" oncontextmenu="return false;"' "$downloadtemp" > "$sortingtemp"
printf '\nChapter URL: %s\n' "$url"

target=""
while IFS= read -r line; do
  pictureno=$((pictureno+1))
  # Cut out the individual image URL (12th '"'-separated field, query string dropped).
  imageurl=$(printf '%s\n' "$line" | cut -d'"' -f12 | cut -d'?' -f1)
  target="$toonname/temp/${toonname}_${episodepad}_$(printf %03d "$pictureno").jpg"
  wget --no-verbose --tries=6 --user-agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0' --referer="http://www.webtoons.com/" "$imageurl" -O "$target" \
    || echo "URL: $imageurl Name:$target" >> "$toonname/error.log"
done < "$sortingtemp"

# Move the last image out of the way; it is assumed to be the webtoon logo.
# Skipped when no image was found or the last download left no file behind.
if [[ -n "$target" && -f "$target" ]]; then
  mv -- "$target" "$toonname/temp/logo/${target##*/}"
fi

# Don't bother doing this, as it will often result in an image larger than ImageMagick can handle
#convert -quality 95 -append "$toonname/temp/$toonname$underscore$(printf %03d $episodeno)*.jpg" "$toonname/$toonname$underscore$(printf %03d $episodeno)_$eipsodename.jpg" || echo -e "convert.im6 error: Input: $toonname/temp/$toonname$underscore$(printf %03d $episodeno)\*.jpg\tOutput: $toonname/$toonname$underscore$(printf %03d $episodeno)_$eipsodename.jpg\tURL: $url" >> "$toonname/error.log"

43
webtoon2html-segmented.sh Executable file
View File

@ -0,0 +1,43 @@
#!/bin/bash
# Builds HTML episode pages from the images in the current directory whose
# names match *_???_???.jpg; each page is named after the first two
# "_"-separated fields of the image name, with ".htm" appended.

# Literal underscore kept in a variable: "_" is a valid character in variable
# names, so writing "$name_$other" would be read as the variable "name_".
underscore="_"
# Disabled, empty stub (presumably for writing an index entry - TODO confirm).
#indexentry() {
#
#}
# Print the opening HTML of an episode page (doctype, <head>, <body> tag and
# an <h2> heading) to stdout as a single line.
#
# Arguments: $1 - image file name (optional; defaults to the global $img)
# Outputs:   the page header; title and heading are the first two
#            "_"-separated fields of the file name.
episode_headders() {
  local image=${1-$img}
  local title
  # Quoted throughout so spaces and glob characters in the name survive intact.
  title=$(printf '%s\n' "$image" | cut -d'_' -f1,2)
  printf '%s%s%s%s%s\n' \
    '<!DOCTYPE html><html><head><title>' \
    "$title" \
    ' - WebToon2HTML</title><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1"><style>img{display:block;}body{background-color:lightgrey;max-width:40em;font:1.1em/1.2em sans-serif;}h1,h2,h3{line-height:1.2em;}@media print{body{max-width:none}}</style></head><body><h2>' \
    "$title" \
    '</h2><hr>'
}
# Build one "<toon>_<episode>.htm" page per episode from the images in the
# current directory named "<toon>_<episode>_<picture>.jpg".
#
# Each page starts with the output of episode_headders, lists its images in
# glob order and ends with the closing </body></html> tags.
# NOTE: the toon name is taken as the text before the first "_", so toon names
# containing underscores are not supported.
main () {
  local img toon episode page
  local current_page=""
  for img in *_???_???.jpg; do
    # An unmatched glob stays literal; skip it and anything that is not a file.
    [[ -f "$img" ]] || continue
    IFS=_ read -r toon episode _ <<< "$img"
    page="${toon}_${episode}.htm"
    if [[ "$page" != "$current_page" ]]; then
      # First image of a new page: close the previous page (if there was one)
      # and start the new one with its header.
      # Put next and back arrows here
      if [[ -n "$current_page" ]]; then
        echo '</body></html>' >> "$current_page"
      fi
      episode_headders "$img" > "$page"
      current_page=$page
    fi
    echo "<img width=\"100%\" src=\"$img\">" >> "$page"
  done
  # The loop only closes a page when the next one starts, so close the last one here.
  if [[ -n "$current_page" ]]; then
    echo '</body></html>' >> "$current_page"
  fi
}
# Entry point: process the images in the current working directory.
main
# Disabled alternative: prints a single page to stdout that uses the current
# directory name (${PWD##*/}) as title and heading and lists every *.jpg.
#main () {
# echo '<!DOCTYPE html><html><head>'"<title>${PWD##*/} - WebToon2HTML</title>"'<meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1"><style>img{display:block}body{background-color:lightgrey;max-width:40em;font:1.1em/1.2em sans-serif;}h1,h2,h3{line-height:1.2em;}@media print{body{max-width:none}}</style></head><body>'"<h2>${PWD##*/}</h2><hr><div class=\"nospace\">"
# for i in *.jpg; do
# echo "<img width=\"100%\" src=\"$i\">"
# done
# echo "</div><hr></html>"
#}