Parsing DOM with PHP
From Leo's Notes
Last edited on 3 June 2017, at 02:36.
Here's a quick-and-dirty script that parses the XML weather feed provided by Environment Canada.
// Download the Environment Canada city-page XML feed and flatten the parts we
// care about into a plain object ($weather_data).
$data = file_get_contents("http://dd.weatheroffice.ec.gc.ca/citypage_weather/xml/AB/s0000047_e.xml");
if ($data === false) {
    // file_get_contents() returns false on failure; stop here with a clear
    // message instead of handing `false` to the XML parser.
    throw new RuntimeException("Unable to download the Environment Canada weather feed");
}

// SimpleXMLElement's constructor throws an Exception if the payload is not well-formed XML.
$weather = new SimpleXMLElement($data);

// The container has to exist before properties are assigned to it: assigning
// a property on an undefined variable only warns on PHP 7 and is a fatal
// Error on PHP 8. An already-defined $weather_data is left untouched.
if (!isset($weather_data)) {
    $weather_data = new stdClass();
}

// Current conditions and headline values.
$weather_data->location = (string) $weather->location->name;
// The second <dateTime> element is used for the date text.
// NOTE(review): presumably the first one is UTC and the second local time - confirm against the feed.
$weather_data->date = (string) $weather->dateTime[1]->textSummary;
$weather_data->text = (string) $weather->currentConditions->condition;
// Without an index, ->forecast refers to the first <forecast> element of the group.
$weather_data->forecastAbbreviated = (string) $weather->forecastGroup->forecast->abbreviatedForecast->textSummary;
$weather_data->forecast = (string) $weather->forecastGroup->forecast->textSummary;
$weather_data->pressure = (string) $weather->currentConditions->pressure;
$weather_data->temp = (string) $weather->currentConditions->temperature;

// One entry per <forecast> element, in feed order.
$forecasts = [];
foreach ($weather->forecastGroup->forecast as $forecast) {
    $forecasts[] = [
        "period" => (string) $forecast->period,
        "forecast" => (string) $forecast->textSummary,
        "text" => (string) $forecast->abbreviatedForecast->textSummary,
        // The (int) casts turn a missing or empty element into 0.
        "pop" => (int) $forecast->abbreviatedForecast->pop,
        "temperature" => (int) $forecast->temperatures->temperature,
        "windchill" => (int) $forecast->windChill->calculated,
        "icon" => (string) $forecast->abbreviatedForecast->iconCode,
    ];
}
$weather_data->forecasts = $forecasts;

// Build the icon URL for the current conditions from the feed's icon code.
$icon = "http://weather.gc.ca/weathericons/" . (string) $weather->currentConditions->iconCode . ".gif";
$weather_data->icon = $icon;
$weather_data->source = "Environment Canada";
For something more complicated, you may want to look into XPath, which lets you navigate and search DOM elements.
For example, I used the following script to scrape the workroom booking schedule published by the LibCal service.
/**
 * Parse a room-booking grid page into a per-room, per-hour status table.
 *
 * Three things are read from the markup:
 *  - room names: the children of the element with id "s-lc-rm-tg-rnames";
 *  - hour labels: the <div> children of elements with class "lc_rm_hours";
 *  - grid cells: elements under id "s-lc-rm-scrolltb" whose class attribute is
 *    exactly "lc_rm_b", "lc_rm_a" or "lc_rm_u".
 *
 * Cells are assumed to be laid out room by room, with one cell per hour label
 * in each room. If the cell count is not rooms x hours, a diagnostic is echoed
 * and the script exits.
 *
 * @param string $html Raw HTML of the schedule page.
 * @return array Map of room name => (hour as parsed by date_parse() => status),
 *               where each status is whatever class_to_status() (defined
 *               elsewhere) returns for the cell's class attribute. Empty input
 *               yields an empty array.
 */
function parse_schedule($html) {
    $html = (string) $html;
    if (trim($html) === '') {
        // Nothing to parse. DOMDocument::loadHTML() rejects an empty string,
        // and an empty document would produce an empty schedule anyway.
        return [];
    }

    $dom = new DOMDocument();
    // Real-world pages are rarely valid HTML; collect libxml's complaints
    // internally instead of emitting one PHP warning per markup error, then
    // restore the caller's previous setting.
    $previous_setting = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_setting);
    $xpath = new DOMXpath($dom);

    // Room names, in document order.
    $rooms = [];
    foreach ($xpath->query("//*[@id='s-lc-rm-tg-rnames']/*") as $el) {
        // Strip ASCII whitespace and U+00A0 (non-breaking space) from both
        // ends. This is done with a UTF-8 aware pattern because trim()'s
        // character list is a set of single bytes: passing "\xc2\xa0" can cut
        // a neighbouring multi-byte character in half.
        $room_number = preg_replace('/^[\s\x00\x{00A0}]+|[\s\x00\x{00A0}]+$/u', '', $el->textContent);
        if ($room_number === null) {
            // preg_replace() failed (e.g. invalid UTF-8); fall back to a plain trim.
            $room_number = trim($el->textContent);
        }
        if ($room_number !== "") {
            $rooms[] = $room_number;
        }
    }

    // Hour labels, in document order.
    $hours = [];
    foreach ($xpath->query("//*[@class='lc_rm_hours']/div") as $el) {
        $hours[] = $el->textContent;
    }

    // Class attribute of every grid cell, in document order.
    $bookings = [];
    $cells = $xpath->query("//*[@id='s-lc-rm-scrolltb']/*/*/*[@class='lc_rm_b' or @class='lc_rm_a' or @class='lc_rm_u']");
    foreach ($cells as $el) {
        $bookings[] = $el->getAttribute('class');
    }

    // The flat cell list can only be mapped onto rooms x hours when the
    // counts line up exactly.
    if (count($bookings) !== count($rooms) * count($hours)) {
        echo "Item count mismatch...";
        echo "Got " . count($bookings) . " bookings and expected " . count($rooms) . " x " . count($hours) ;
        echo "\n";
        exit;
    }

    // Walk the flat cell list room by room, hour by hour.
    $schedule = [];
    $i = 0;
    foreach ($rooms as $room) {
        $schedule[$room] = [];
        foreach ($hours as $hour) {
            // NOTE(review): date_parse() reports 'hour' as false when the label
            // has no recognisable time; PHP would then use 0 as the array key.
            $dateparsed = date_parse($hour);
            $schedule[$room][ $dateparsed['hour'] ] = class_to_status($bookings[$i]);
            $i++;
        }
    }
    return $schedule;
}