Commit 2e715679 authored by Miha
Browse files

Now able to scrape procedure meta-data

parent 774d2f5d
......@@ -2,4 +2,5 @@ bin
obj
*.suo
*.bak
Data
\ No newline at end of file
Documents
Procedures
\ No newline at end of file
......@@ -22,56 +22,102 @@ namespace EurLexScraper
}
}
// Selects which EUR-Lex search Main scrapes; Main branches on this value.
enum Mode
{
// Scrape the per-year document search results (DD_YEAR search URL).
Documents,
// Scrape the legal-procedure search results (LEGAL_PROCEDURE search URL).
Procedures
}
static void Main(string[] args)
{
Mode mode = Mode.Procedures;
using (var driver = new ChromeDriver())
{
for (int year = 2010; year >= 2000; year--)
if (mode == Mode.Documents)
{
for (int year = 2005; year >= 2000; year--)
{
Console.WriteLine("Fetching data for year {0} ...", year);
driver.Navigate().GoToUrl(string.Format("http://eur-lex.europa.eu/search.html?CASE_LAW_SUMMARY=false&DTS_DOM=ALL&type=advanced&SUBDOM_INIT=ALL_ALL&DTS_SUBDOM=ALL_ALL&DD_YEAR={0}&page=1&locale=en&sortOne=DD&sortOneOrder=asc", year));
driver.FindElementByXPath("//a[text()='Change displayed metadata']").Click();
WaitFor(() => {
driver.FindElementById("nbResultPerPage");
driver.FindElementByXPath("//a[text()='Advanced selection']");
driver.FindElementById("advanced_c0textCommon");
driver.FindElementById("advanced_c1datesCommon");
driver.FindElementById("advanced_c2technicalMetCommon");
driver.FindElementById("advanced_c3otherSearchCritCommon");
driver.FindElementById("advanced_c0descriptors");
driver.FindElementById("advanced_c1bibDetails");
driver.FindElementById("advanced_c2dates");
driver.FindElementById("advanced_c3relBetweenDocs");
driver.FindElementById("advanced_c4caseLawParties");
driver.FindElementById("advanced_c5caseLawOtherData");
driver.FindElementById("advanced_c6otherSearchCrit");
driver.FindElementById("button.apply");
return true;
});
driver.FindElementById("nbResultPerPage").SendKeys("20");
driver.FindElementByXPath("//a[text()='Advanced selection']").Click();
// select *all* metadata
driver.FindElementById("advanced_c0textCommon").Click();
driver.FindElementById("advanced_c1datesCommon").Click();
driver.FindElementById("advanced_c2technicalMetCommon").Click();
driver.FindElementById("advanced_c3otherSearchCritCommon").Click();
driver.FindElementById("advanced_c0descriptors").Click();
driver.FindElementById("advanced_c1bibDetails").Click();
driver.FindElementById("advanced_c2dates").Click();
driver.FindElementById("advanced_c3relBetweenDocs").Click();
driver.FindElementById("advanced_c4caseLawParties").Click();
driver.FindElementById("advanced_c5caseLawOtherData").Click();
driver.FindElementById("advanced_c6otherSearchCrit").Click();
driver.FindElementById("button.apply").Click();
int pg = 0;
Directory.CreateDirectory(string.Format(@"C:\Work\EurLexScraper\Documents\{0}", year));
while (true)
{
driver.Navigate().GoToUrl(Regex.Replace(driver.Url, @"page=\d+", "page=" + ++pg));
// check if we're done
if (!Regex.Match(driver.Url, @"page=\d+").Success) { break; }
// if not, save HTML
File.WriteAllText(string.Format(@"C:\Work\EurLexScraper\Documents\{0}\page_{1}.html", year, pg), driver.PageSource, Encoding.UTF8);
}
}
}
else // mode == Mode.Procedures
{
Console.WriteLine("Fetching data for year {0} ...", year);
driver.Navigate().GoToUrl(string.Format("http://eur-lex.europa.eu/search.html?CASE_LAW_SUMMARY=false&DTS_DOM=ALL&type=advanced&SUBDOM_INIT=ALL_ALL&DTS_SUBDOM=ALL_ALL&DD_YEAR={0}&page=1&locale=en&sortOne=DD&sortOneOrder=asc", year));
Console.WriteLine("Fetching data about procedures ...");
driver.Navigate().GoToUrl("http://eur-lex.europa.eu/search.html?SUBDOM_INIT=LEGAL_PROCEDURE&DTS_DOM=LEGAL_PROCEDURE&type=advanced&DTS_SUBDOM=LEGAL_PROCEDURE&sortOne=DD&sortOneOrder=desc&page=1&locale=en");
driver.FindElementByXPath("//a[text()='Change displayed metadata']").Click();
WaitFor(() => {
driver.FindElementById("nbResultPerPage");
driver.FindElementByXPath("//a[text()='Advanced selection']");
driver.FindElementById("advanced_c0textCommon");
driver.FindElementById("advanced_c1datesCommon");
driver.FindElementById("advanced_c2technicalMetCommon");
driver.FindElementById("advanced_c3otherSearchCritCommon");
driver.FindElementById("advanced_c0descriptors");
driver.FindElementById("advanced_c1bibDetails");
driver.FindElementById("advanced_c2dates");
driver.FindElementById("advanced_c3relBetweenDocs");
driver.FindElementById("advanced_c4caseLawParties");
driver.FindElementById("advanced_c5caseLawOtherData");
driver.FindElementById("advanced_c6otherSearchCrit");
driver.FindElementById("advanced_c0textProc");
driver.FindElementById("advanced_c1descriptorsProc");
driver.FindElementById("advanced_c2bibDetailsProc");
driver.FindElementById("advanced_c3relBetweenDocsProc");
driver.FindElementById("advanced_c4technicalMetsProc");
driver.FindElementById("button.apply");
return true;
});
driver.FindElementById("nbResultPerPage").SendKeys("20");
driver.FindElementByXPath("//a[text()='Advanced selection']").Click();
// select *all* metadata
driver.FindElementById("advanced_c0textCommon").Click();
driver.FindElementById("advanced_c1datesCommon").Click();
driver.FindElementById("advanced_c2technicalMetCommon").Click();
driver.FindElementById("advanced_c3otherSearchCritCommon").Click();
driver.FindElementById("advanced_c0descriptors").Click();
driver.FindElementById("advanced_c1bibDetails").Click();
driver.FindElementById("advanced_c2dates").Click();
driver.FindElementById("advanced_c3relBetweenDocs").Click();
driver.FindElementById("advanced_c4caseLawParties").Click();
driver.FindElementById("advanced_c5caseLawOtherData").Click();
driver.FindElementById("advanced_c6otherSearchCrit").Click();
driver.FindElementById("advanced_c0textProc").Click();
driver.FindElementById("advanced_c1descriptorsProc").Click();
driver.FindElementById("advanced_c2bibDetailsProc").Click();
driver.FindElementById("advanced_c3relBetweenDocsProc").Click();
driver.FindElementById("advanced_c4technicalMetsProc").Click();
driver.FindElementById("button.apply").Click();
int pg = 0;
Directory.CreateDirectory(string.Format(@"C:\Work\EurLexScraper\Data\{0}", year));
Directory.CreateDirectory(@"C:\Work\EurLexScraper\Procedures");
while (true)
{
driver.Navigate().GoToUrl(Regex.Replace(driver.Url, @"page=\d+", "page=" + ++pg));
// check if we're done
if (!Regex.Match(driver.Url, @"page=\d+").Success) { break; }
// if not, save HTML
File.WriteAllText(string.Format(@"C:\Work\EurLexScraper\Data\{0}\page_{1}.html", year, pg), driver.PageSource, Encoding.UTF8);
File.WriteAllText(string.Format(@"C:\Work\EurLexScraper\Procedures\page_{0}.html", pg), driver.PageSource, Encoding.UTF8);
}
}
Console.WriteLine("Press any key to exit ...");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment