Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
KT
EUR-Lex Scraper
Commits
2e715679
Commit
2e715679
authored
Jun 12, 2015
by
Miha
Browse files
Now able to scrape procedure meta-data
parent
774d2f5d
Changes
2
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
2e715679
...
...
@@ -2,4 +2,5 @@ bin
obj
*.suo
*.bak
Data
\ No newline at end of file
Documents
Procedures
\ No newline at end of file
Program.cs
View file @
2e715679
...
...
@@ -22,56 +22,102 @@ namespace EurLexScraper
}
}
/// <summary>
/// Selects which kind of EUR-Lex search results a scraper run fetches.
/// </summary>
enum Mode
{
    /// <summary>Scrape the per-year document search result pages.</summary>
    Documents = 0,

    /// <summary>Scrape the legal-procedure search result pages.</summary>
    Procedures = 1
}
static
void
Main
(
string
[]
args
)
{
Mode
mode
=
Mode
.
Procedures
;
using
(
var
driver
=
new
ChromeDriver
())
{
for
(
int
year
=
2010
;
year
>=
2000
;
year
--)
if
(
mode
==
Mode
.
Documents
)
{
for
(
int
year
=
2005
;
year
>=
2000
;
year
--)
{
Console
.
WriteLine
(
"Fetching data for year {0} ..."
,
year
);
driver
.
Navigate
().
GoToUrl
(
string
.
Format
(
"http://eur-lex.europa.eu/search.html?CASE_LAW_SUMMARY=false&DTS_DOM=ALL&type=advanced&SUBDOM_INIT=ALL_ALL&DTS_SUBDOM=ALL_ALL&DD_YEAR={0}&page=1&locale=en&sortOne=DD&sortOneOrder=asc"
,
year
));
driver
.
FindElementByXPath
(
"//a[text()='Change displayed metadata']"
).
Click
();
WaitFor
(()
=>
{
driver
.
FindElementById
(
"nbResultPerPage"
);
driver
.
FindElementByXPath
(
"//a[text()='Advanced selection']"
);
driver
.
FindElementById
(
"advanced_c0textCommon"
);
driver
.
FindElementById
(
"advanced_c1datesCommon"
);
driver
.
FindElementById
(
"advanced_c2technicalMetCommon"
);
driver
.
FindElementById
(
"advanced_c3otherSearchCritCommon"
);
driver
.
FindElementById
(
"advanced_c0descriptors"
);
driver
.
FindElementById
(
"advanced_c1bibDetails"
);
driver
.
FindElementById
(
"advanced_c2dates"
);
driver
.
FindElementById
(
"advanced_c3relBetweenDocs"
);
driver
.
FindElementById
(
"advanced_c4caseLawParties"
);
driver
.
FindElementById
(
"advanced_c5caseLawOtherData"
);
driver
.
FindElementById
(
"advanced_c6otherSearchCrit"
);
driver
.
FindElementById
(
"button.apply"
);
return
true
;
});
driver
.
FindElementById
(
"nbResultPerPage"
).
SendKeys
(
"20"
);
driver
.
FindElementByXPath
(
"//a[text()='Advanced selection']"
).
Click
();
// select *all* metadata
driver
.
FindElementById
(
"advanced_c0textCommon"
).
Click
();
driver
.
FindElementById
(
"advanced_c1datesCommon"
).
Click
();
driver
.
FindElementById
(
"advanced_c2technicalMetCommon"
).
Click
();
driver
.
FindElementById
(
"advanced_c3otherSearchCritCommon"
).
Click
();
driver
.
FindElementById
(
"advanced_c0descriptors"
).
Click
();
driver
.
FindElementById
(
"advanced_c1bibDetails"
).
Click
();
driver
.
FindElementById
(
"advanced_c2dates"
).
Click
();
driver
.
FindElementById
(
"advanced_c3relBetweenDocs"
).
Click
();
driver
.
FindElementById
(
"advanced_c4caseLawParties"
).
Click
();
driver
.
FindElementById
(
"advanced_c5caseLawOtherData"
).
Click
();
driver
.
FindElementById
(
"advanced_c6otherSearchCrit"
).
Click
();
driver
.
FindElementById
(
"button.apply"
).
Click
();
int
pg
=
0
;
Directory
.
CreateDirectory
(
string
.
Format
(
@"C:\Work\EurLexScraper\Documents\{0}"
,
year
));
while
(
true
)
{
driver
.
Navigate
().
GoToUrl
(
Regex
.
Replace
(
driver
.
Url
,
@"page=\d+"
,
"page="
+
++
pg
));
// check if we're done
if
(!
Regex
.
Match
(
driver
.
Url
,
@"page=\d+"
).
Success
)
{
break
;
}
// if not, save HTML
File
.
WriteAllText
(
string
.
Format
(
@"C:\Work\EurLexScraper\Documents\{0}\page_{1}.html"
,
year
,
pg
),
driver
.
PageSource
,
Encoding
.
UTF8
);
}
}
}
else
// mode == Mode.Procedures
{
Console
.
WriteLine
(
"Fetching data
for year {0} ..."
,
year
);
driver
.
Navigate
().
GoToUrl
(
string
.
Format
(
"http://eur-lex.europa.eu/search.html?
CASE_LAW_SUMMARY=false&DTS_DOM=ALL&type=advanced&SUBDOM_INIT=ALL_ALL&DTS_SUBDOM=ALL_ALL&DD_YEAR={0}&page=1&locale=en
&sortOne=DD&sortOneOrder=
asc"
,
year
)
);
Console
.
WriteLine
(
"Fetching data
about procedures ..."
);
driver
.
Navigate
().
GoToUrl
(
"http://eur-lex.europa.eu/search.html?
SUBDOM_INIT=LEGAL_PROCEDURE&DTS_DOM=LEGAL_PROCEDURE&type=advanced&DTS_SUBDOM=LEGAL_PROCEDURE
&sortOne=DD&sortOneOrder=
desc&page=1&locale=en"
);
driver
.
FindElementByXPath
(
"//a[text()='Change displayed metadata']"
).
Click
();
WaitFor
(()
=>
{
driver
.
FindElementById
(
"nbResultPerPage"
);
driver
.
FindElementByXPath
(
"//a[text()='Advanced selection']"
);
driver
.
FindElementById
(
"advanced_c0textCommon"
);
driver
.
FindElementById
(
"advanced_c1datesCommon"
);
driver
.
FindElementById
(
"advanced_c2technicalMetCommon"
);
driver
.
FindElementById
(
"advanced_c3otherSearchCritCommon"
);
driver
.
FindElementById
(
"advanced_c0descriptors"
);
driver
.
FindElementById
(
"advanced_c1bibDetails"
);
driver
.
FindElementById
(
"advanced_c2dates"
);
driver
.
FindElementById
(
"advanced_c3relBetweenDocs"
);
driver
.
FindElementById
(
"advanced_c4caseLawParties"
);
driver
.
FindElementById
(
"advanced_c5caseLawOtherData"
);
driver
.
FindElementById
(
"advanced_c6otherSearchCrit"
);
driver
.
FindElementById
(
"advanced_c0textProc"
);
driver
.
FindElementById
(
"advanced_c1descriptorsProc"
);
driver
.
FindElementById
(
"advanced_c2bibDetailsProc"
);
driver
.
FindElementById
(
"advanced_c3relBetweenDocsProc"
);
driver
.
FindElementById
(
"advanced_c4technicalMetsProc"
);
driver
.
FindElementById
(
"button.apply"
);
return
true
;
});
driver
.
FindElementById
(
"nbResultPerPage"
).
SendKeys
(
"20"
);
driver
.
FindElementByXPath
(
"//a[text()='Advanced selection']"
).
Click
();
// select *all* metadata
driver
.
FindElementById
(
"advanced_c0textCommon"
).
Click
();
driver
.
FindElementById
(
"advanced_c1datesCommon"
).
Click
();
driver
.
FindElementById
(
"advanced_c2technicalMetCommon"
).
Click
();
driver
.
FindElementById
(
"advanced_c3otherSearchCritCommon"
).
Click
();
driver
.
FindElementById
(
"advanced_c0descriptors"
).
Click
();
driver
.
FindElementById
(
"advanced_c1bibDetails"
).
Click
();
driver
.
FindElementById
(
"advanced_c2dates"
).
Click
();
driver
.
FindElementById
(
"advanced_c3relBetweenDocs"
).
Click
();
driver
.
FindElementById
(
"advanced_c4caseLawParties"
).
Click
();
driver
.
FindElementById
(
"advanced_c5caseLawOtherData"
).
Click
();
driver
.
FindElementById
(
"advanced_c6otherSearchCrit"
).
Click
();
driver
.
FindElementById
(
"advanced_c0textProc"
).
Click
();
driver
.
FindElementById
(
"advanced_c1descriptorsProc"
).
Click
();
driver
.
FindElementById
(
"advanced_c2bibDetailsProc"
).
Click
();
driver
.
FindElementById
(
"advanced_c3relBetweenDocsProc"
).
Click
();
driver
.
FindElementById
(
"advanced_c4technicalMetsProc"
).
Click
();
driver
.
FindElementById
(
"button.apply"
).
Click
();
int
pg
=
0
;
Directory
.
CreateDirectory
(
string
.
Format
(
@"C:\Work\EurLexScraper\
Data\{0}"
,
year
)
);
Directory
.
CreateDirectory
(
@"C:\Work\EurLexScraper\
Procedures"
);
while
(
true
)
{
driver
.
Navigate
().
GoToUrl
(
Regex
.
Replace
(
driver
.
Url
,
@"page=\d+"
,
"page="
+
++
pg
));
// check if we're done
if
(!
Regex
.
Match
(
driver
.
Url
,
@"page=\d+"
).
Success
)
{
break
;
}
// if not, save HTML
File
.
WriteAllText
(
string
.
Format
(
@"C:\Work\EurLexScraper\
Data\{0}
\page_{
1
}.html"
,
year
,
pg
),
driver
.
PageSource
,
Encoding
.
UTF8
);
File
.
WriteAllText
(
string
.
Format
(
@"C:\Work\EurLexScraper\
Procedures
\page_{
0
}.html"
,
pg
),
driver
.
PageSource
,
Encoding
.
UTF8
);
}
}
Console
.
WriteLine
(
"Press any key to exit ..."
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment