Remove HTTP headers from gzip or zip on stdin yy054 (revised)
.
/* remove HTTP headers from multiple gzip or single zip from stdin */
%{
/*
 * yy054 (first version): copy stdin to stdout while dropping HTTP response
 * headers, or, when any command-line argument is given, print only the
 * headers.
 *
 * The declarations below are wrapped in %{ ... %} so that flex copies them
 * verbatim into the generated scanner; unindented text outside such a block
 * in the definitions section is parsed by flex as name definitions.
 */
int fileno (FILE *);
int setenv (const char *, const char *, int);  /* not used by this scanner */
#define jmp (yy_start) = 1 + 2 *               /* not used by this scanner */

/* Non-zero: emit only the HTTP headers.  Zero: emit everything else. */
int show_headers;
%}
%option nounput noinput noyywrap
%%
HTTP\/[01]\.[\15\12\40-\176]{0,1024}\r\n\r\n {
        /*
         * A status line plus header lines, terminated by an empty line.
         * NOTE(review): the bounded class also accepts CR and LF and flex
         * takes the longest match, so a match can extend past the first
         * empty line when printable data ending in CRLF CRLF follows within
         * the 1024-byte window; header blocks longer than the window are
         * not matched at all.
         */
        if (show_headers) {
                fwrite(yytext, 1, yyleng, yyout);
                putc('\n', yyout);      /* separate consecutive header blocks */
        }
}
.|\n    {
        /* Every other byte is payload: pass it through unless in headers-only mode. */
        if (!show_headers)
                fwrite(yytext, 1, yyleng, yyout);
}
%%
/*
 * Entry point.  The presence of a first argument (its value is ignored)
 * selects headers-only output; with no argument the headers are stripped.
 */
int main(int argc, char *argv[])
{
        if (argc > 1 && argv[1] != NULL)
                show_headers = 1;
        yylex();
        return 0;
}
Correction:
/* remove HTTP headers from multiple gzip or single zip from stdin */
%{
/*
 * yy054 (corrected version): copy stdin to stdout, dropping every HTTP
 * response header block (status line, header lines and the empty line that
 * ends them).  Bytes that match no rule are echoed by flex's default rule,
 * which is how the payload reaches stdout.
 *
 * The declarations are wrapped in %{ ... %} so that flex copies them
 * verbatim; unindented text outside such a block in the definitions section
 * is parsed by flex as name definitions.
 */
int fileno (FILE *);
int setenv (const char *, const char *, int);  /* not used by this scanner */
#define jmp (yy_start) = 1 + 2 *               /* not used by this scanner */

/* Non-zero while the scanner is inside a header block. */
int x;
%}
%option nounput noinput noyywrap
%%
HTTP\/[\40-\176]+\x0d\x0a {
        /* Status line: a header block starts here; the line itself is dropped. */
        x = 1;
}
[\40-\176]+:[\11\40-\176]*\r\n {
        /*
         * "name: value" line.  The value may be empty or contain tabs; without
         * that, such a line would fall through to the bare-CRLF rule below,
         * clear the flag early and leak the remaining headers into the output.
         * Outside a header block the text is payload and is written unchanged.
         */
        if (!x)
                fwrite(yytext, 1, yyleng, yyout);
}
\x0D\x0A {
        /* Bare CRLF: payload when outside a header block, otherwise the
         * empty line that ends the block (dropped).  Either way the header
         * state is cleared afterwards. */
        if (!x)
                fwrite(yytext, 1, yyleng, yyout);
        x = 0;
}
%%
/* Entry point: run the scanner over stdin until end of input. */
int main(void)
{
        yylex();
        return 0;
}
Usage example: Retrieve hostnames, IP addresses and (if available) sitemap URLs from the latest Common Crawl.
# Download the list of robots.txt archive paths for the crawl.
ftp -4 https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-50/robotstxt.paths.gz # <-- 180K
# Turn the first five paths into HTTP/1.1 requests sent over one TLS
# connection.  In the sed output "[" stands for CR and "]" for LF (tr maps
# them below); the trailing "[" pairs with the newline sed prints after each
# line, so every request ends with CRLF CRLF instead of CRLF plus a bare LF.
# All but the last request ask for keep-alive; the last one asks to close.
# yy054 strips the response headers and zegrep searches the gzip data that is left.
gzip -dc robotstxt.paths.gz \
|head -5 \
|sed 's>.*>GET /& HTTP/1.1[]Host: data.commoncrawl.org[]Connection: >;
$!s/$/keep-alive[][/;$s/$/close[][/' \
|tr '[]' '\r\n' \
|openssl s_client -quiet -connect data.commoncrawl.org:443 \
|yy054 \
|zegrep -a '(^Sitemap:)|(^Host:)|(^WARC-Target-URI:)|(^WARC-IP-Address:)' > 1.txt
exec cat 1.txt
Usage example:
Download NetBSD 1.0 in a single TCP connection.
# Request prefix and Host header shared by every request.
y="GET /pub/NetBSD-archive/NetBSD-1.0/source/src10/"
z="Host: archive.netbsd.org"
# Build one request per file name read from the here-document.  "[" stands
# for CR and "]" for LF (tr maps them below); the trailing "[" pairs with the
# newline sed prints after each line, so every request ends with CRLF CRLF
# instead of CRLF plus a bare LF.  All but the last request are HTTP/1.1 with
# keep-alive; the last one is HTTP/1.0 without a Connection header.
# The raw responses (headers + gzip data) are saved to the file http+gzip.
sed '$!s>.*>'"$y"'& HTTP/1.1[]'"$z"'[]Connection: keep-alive[][>;
$s>.*>'"$y"'& HTTP/1.0[]'"$z"'[][>' << eof \
|tr '[]' '\r\n' \
|openssl s_client -quiet -connect 151.101.129.6:443 -servername archive.netbsd.org > http+gzip
src10.aa
src10.ab
src10.ac
src10.ad
src10.ae
src10.af
src10.ag
src10.ah
src10.ai
src10.aj
src10.ak
src10.al
src10.am
src10.an
src10.ao
src10.ap
src10.aq
src10.ar
src10.as
src10.at
src10.au
src10.av
src10.aw
src10.ax
src10.ay
src10.az
src10.ba
src10.bb
src10.bc
src10.bd
src10.be
src10.bf
eof
# Strip the HTTP headers and list the contents of the resulting gzip'd tar stream.
yy054 < http+gzip|tar tvzf /dev/stdin
Alternate usage: Including an argv[1] will print the HTTP headers only.
# Any first argument selects headers-only output in the first yy054 listing;
# its value is ignored, so "print" and "x" behave the same.  The corrected
# listing's main() takes no arguments, so this mode applies to the first
# version only.
yy054 print < http+gzip
yy054 x < http+gzip