Buenas
Pasos a seguir:
1) Obtener el código fuente para parsearlo.
2) Encontrar el elemento que contiene el valor mediante XPath o expresiones regulares (Regex), y obtener el valor en cuestión, el del Ibex en este caso.
Solo debes examinar minuciosamente el código fuente de la página para localizar el XPath correcto. Te muestro un ejemplo real en VB.NET que utiliza la librería HtmlAgilityPack, y también su traducción automática a C# (que podría contener errores):
vb.net
''' <summary>
''' Demo form that parses a fixed HTML sample with HtmlAgilityPack and shows,
''' for every release entry, its title, cover, year, genres and URL.
''' </summary>
Public Class Form1

    ' Sample markup to parse. The XML-literal/CDATA construct is only a way to
    ' embed a multi-line string; ".Value" yields the raw text of the CDATA section.
    Private ReadOnly html As String =
<a><![CDATA[
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<body>
<div class="infolinks"><input type="hidden" name="IL_IN_TAG" value="1"/></div><div id="main">
<div class="music">
<h2 class="boxtitle">New releases \ <small>
<a href="/newalbums" title="New releases mp3 downloads" rel="bookmark">see all</a></small>
</h2>
<div class="item">
<div class="thumb">
<a href="http://www.mp3crank.com/curt-smith/deceptively-heavy-121861" rel="bookmark" lang="en" title="Curt Smith - Deceptively Heavy album downloads"><img width="100" height="100" alt="Mp3 downloads Curt Smith - Deceptively Heavy" title="Free mp3 downloads Curt Smith - Deceptively Heavy" src="http://www.mp3crank.com/cover-album/Curt-Smith-Deceptively-Heavy-400x400.jpg"/></a>
</div>
<div class="release">
<h3>Curt Smith</h3>
<h4>
<a href="http://www.mp3crank.com/curt-smith/deceptively-heavy-121861" title="Mp3 downloads Curt Smith - Deceptively Heavy">Deceptively Heavy</a>
</h4>
<script src="/ads/button.js"></script>
</div>
<div class="release-year">
<p>Year</p>
<span>2013</span>
</div>
<div class="genre">
<p>Genre</p>
<a href="http://www.mp3crank.com/genre/indie" rel="tag">Indie</a><a href="http://www.mp3crank.com/genre/pop" rel="tag">Pop</a>
</div>
</div>
<div class="item">
<div class="thumb">
<a href="http://www.mp3crank.com/wolf-eyes/lower-demos-121866" rel="bookmark" lang="en" title="Wolf Eyes - Lower Demos album downloads"><img width="100" height="100" alt="Mp3 downloads Wolf Eyes - Lower Demos" title="Free mp3 downloads Wolf Eyes - Lower Demos" src="http://www.mp3crank.com/cover-album/Wolf-Eyes-–-Lower-Demos.jpg" /></a>
</div>
<div class="release">
<h3>Wolf Eyes</h3>
<h4>
<a href="http://www.mp3crank.com/wolf-eyes/lower-demos-121866" title="Mp3 downloads Wolf Eyes - Lower Demos">Lower Demos</a>
</h4>
<script src="/ads/button.js"></script>
</div>
<div class="release-year">
<p>Year</p>
<span>2013</span>
</div>
<div class="genre">
<p>Genre</p>
<a href="http://www.mp3crank.com/genre/rock" rel="tag">Rock</a>
</div>
</div>
</div>
</div>
</body>
</html>
]]></a>.Value

    Private sb As New System.Text.StringBuilder
    Private htmldoc As HtmlAgilityPack.HtmlDocument = New HtmlAgilityPack.HtmlDocument
    Private htmlnodes As HtmlAgilityPack.HtmlNodeCollection = Nothing

    Private Title As String = String.Empty
    Private Cover As String = String.Empty
    Private Year As String = String.Empty
    Private Genres As String() = {String.Empty}
    Private URL As String = String.Empty

    ''' <summary>
    ''' Parses the sample markup when the form is first shown and displays
    ''' one message box per release entry.
    ''' </summary>
    Private Sub Test() Handles MyBase.Shown

        ' Load the html document.
        htmldoc.LoadHtml(html)

        ' Select every release entry.
        ' All relative XPath queries below use this DIV element as a starting point.
        htmlnodes = htmldoc.DocumentNode.SelectNodes("//div[@class='item']")

        ' HtmlAgilityPack yields Nothing (not an empty collection) when no node matches.
        If htmlnodes Is Nothing Then
            Return
        End If

        ' Loop through the nodes.
        For Each node As HtmlAgilityPack.HtmlNode In htmlnodes

            ' Set the values:
            Title = ReadAttribute(node, ".//div[@class='release']/h4/a[@title]", "title", "Unknown Title")
            Cover = ReadAttribute(node, ".//div[@class='thumb']/a/img[@src]", "src", String.Empty)
            Year = ReadInnerText(node, ".//div[@class='release-year']/span", String.Empty)
            URL = ReadAttribute(node, ".//div[@class='release']/h4/a[@href]", "href", "Unknown URL")

            Dim genreNodes As HtmlAgilityPack.HtmlNodeCollection =
                node.SelectNodes(".//div[@class='genre']/a")

            If genreNodes Is Nothing Then
                Genres = New String() {}
            Else
                Genres = (From genre In genreNodes Select genre.InnerText).ToArray
            End If

            ' Display the values:
            sb.Clear()
            sb.AppendLine(String.Format("Title : {0}", Title))
            sb.AppendLine(String.Format("Cover : {0}", Cover))
            sb.AppendLine(String.Format("Year  : {0}", Year))
            sb.AppendLine(String.Format("Genres: {0}", String.Join(", ", Genres)))
            sb.AppendLine(String.Format("URL   : {0}", URL))
            MsgBox(sb.ToString)

        Next node

    End Sub

    ''' <summary>
    ''' Reads an attribute from the first node matching a relative XPath.
    ''' </summary>
    ''' <param name="parent">The node the XPath is evaluated against.</param>
    ''' <param name="xpath">The relative XPath expression of the target node.</param>
    ''' <param name="attributeName">The name of the attribute to read.</param>
    ''' <param name="fallback">The value returned when the node or the attribute is missing.</param>
    ''' <returns>The attribute value, or <paramref name="fallback"/>.</returns>
    Private Shared Function ReadAttribute(ByVal parent As HtmlAgilityPack.HtmlNode,
                                          ByVal xpath As String,
                                          ByVal attributeName As String,
                                          ByVal fallback As String) As String

        Dim target As HtmlAgilityPack.HtmlNode = parent.SelectSingleNode(xpath)

        If target Is Nothing Then
            Return fallback
        End If

        Return target.GetAttributeValue(attributeName, fallback)

    End Function

    ''' <summary>
    ''' Reads the inner text of the first node matching a relative XPath.
    ''' </summary>
    ''' <param name="parent">The node the XPath is evaluated against.</param>
    ''' <param name="xpath">The relative XPath expression of the target node.</param>
    ''' <param name="fallback">The value returned when no node matches.</param>
    ''' <returns>The node's inner text, or <paramref name="fallback"/>.</returns>
    Private Shared Function ReadInnerText(ByVal parent As HtmlAgilityPack.HtmlNode,
                                          ByVal xpath As String,
                                          ByVal fallback As String) As String

        Dim target As HtmlAgilityPack.HtmlNode = parent.SelectSingleNode(xpath)

        If target Is Nothing Then
            Return fallback
        End If

        Return target.InnerText

    End Function

End Class
c#:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.Linq;
using Microsoft.VisualBasic;
/// <summary>
/// Demo form that parses a fixed HTML sample with HtmlAgilityPack and shows,
/// for every release entry, its title, cover, year, genres and URL.
/// </summary>
/// <remarks>
/// NOTE(review): declared partial so it can coexist with a designer-generated
/// part; if one exists, the constructor presumably also needs to call
/// InitializeComponent() - confirm against the project.
/// </remarks>
public partial class Form1 : System.Windows.Forms.Form
{
    // Sample markup to parse. The VB original wrapped this text in an XML
    // literal/CDATA purely to get a multi-line string; a plain string is equivalent.
    private readonly string html =
        "\n<!DOCTYPE html>\n" +
        "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n" +
        "<body>\n\n" +
        "\t<div class=\"infolinks\"><input type=\"hidden\" name=\"IL_IN_TAG\" value=\"1\"/></div><div id=\"main\">\n\n" +
        "\t\t<div class=\"music\">\n\n" +
        "\t\t\t<h2 class=\"boxtitle\">New releases \\ <small>\n" +
        "\t\t\t\t<a href=\"/newalbums\" title=\"New releases mp3 downloads\" rel=\"bookmark\">see all</a></small>\n" +
        "\t\t\t</h2>\n\n" +
        "\t\t\t<div class=\"item\">\n\n" +
        "\t \t\t<div class=\"thumb\">\n" +
        "\t\t\t\t\t<a href=\"http://www.mp3crank.com/curt-smith/deceptively-heavy-121861\" rel=\"bookmark\" lang=\"en\" title=\"Curt Smith - Deceptively Heavy album downloads\"><img width=\"100\" height=\"100\" alt=\"Mp3 downloads Curt Smith - Deceptively Heavy\" title=\"Free mp3 downloads Curt Smith - Deceptively Heavy\" src=\"http://www.mp3crank.com/cover-album/Curt-Smith-Deceptively-Heavy-400x400.jpg\"/></a>\n" +
        "\t \t\t</div>\n\n" +
        "\t\t\t\t<div class=\"release\">\n" +
        "\t\t\t\t\t<h3>Curt Smith</h3>\n" +
        "\t\t\t\t\t<h4>\n" +
        "\t\t\t\t\t\t<a href=\"http://www.mp3crank.com/curt-smith/deceptively-heavy-121861\" title=\"Mp3 downloads Curt Smith - Deceptively Heavy\">Deceptively Heavy</a>\n" +
        "\t\t\t\t\t</h4>\n" +
        "\t\t\t\t\t<script src=\"/ads/button.js\"></script>\n" +
        "\t\t\t\t</div>\n\n" +
        "\t\t\t\t<div class=\"release-year\">\n" +
        "\t\t\t\t\t<p>Year</p>\n" +
        "\t\t\t\t\t<span>2013</span>\n" +
        "\t\t\t\t</div>\n\n" +
        "\t\t\t\t<div class=\"genre\">\n" +
        "\t\t\t\t\t<p>Genre</p>\n" +
        "\t\t\t\t\t<a href=\"http://www.mp3crank.com/genre/indie\" rel=\"tag\">Indie</a><a href=\"http://www.mp3crank.com/genre/pop\" rel=\"tag\">Pop</a>\n" +
        "\t\t\t\t</div>\n\n" +
        "\t\t\t</div>\n\n" +
        "\t\t\t<div class=\"item\">\n\n" +
        "\t \t\t<div class=\"thumb\">\n" +
        "\t\t\t\t\t<a href=\"http://www.mp3crank.com/wolf-eyes/lower-demos-121866\" rel=\"bookmark\" lang=\"en\" title=\"Wolf Eyes - Lower Demos album downloads\"><img width=\"100\" height=\"100\" alt=\"Mp3 downloads Wolf Eyes - Lower Demos\" title=\"Free mp3 downloads Wolf Eyes - Lower Demos\" src=\"http://www.mp3crank.com/cover-album/Wolf-Eyes-–-Lower-Demos.jpg\" /></a>\n" +
        "\t \t\t</div>\n\n" +
        "\t\t\t\t<div class=\"release\">\n" +
        "\t\t\t\t\t<h3>Wolf Eyes</h3>\n" +
        "\t\t\t\t\t<h4>\n" +
        "\t\t\t\t\t\t<a href=\"http://www.mp3crank.com/wolf-eyes/lower-demos-121866\" title=\"Mp3 downloads Wolf Eyes - Lower Demos\">Lower Demos</a>\n" +
        "\t\t\t\t\t</h4>\n" +
        "\t\t\t\t\t<script src=\"/ads/button.js\"></script>\n" +
        "\t\t\t\t</div>\n\n" +
        "\t\t\t\t<div class=\"release-year\">\n" +
        "\t\t\t\t\t<p>Year</p>\n" +
        "\t\t\t\t\t<span>2013</span>\n" +
        "\t\t\t\t</div>\n\n" +
        "\t\t\t\t<div class=\"genre\">\n" +
        "\t\t\t\t\t<p>Genre</p>\n" +
        "\t\t\t\t\t<a href=\"http://www.mp3crank.com/genre/rock\" rel=\"tag\">Rock</a>\n" +
        "\t\t\t\t</div>\n\n" +
        "\t\t\t</div>\n\n" +
        "\t\t</div>\n\n" +
        "\t</div>\n\n" +
        "</body>\n" +
        "</html>\n";

    private System.Text.StringBuilder sb = new System.Text.StringBuilder();
    private HtmlAgilityPack.HtmlDocument htmldoc = new HtmlAgilityPack.HtmlDocument();
    private HtmlAgilityPack.HtmlNodeCollection htmlnodes = null;

    private string Title = string.Empty;
    private string Cover = string.Empty;
    private string Year = string.Empty;
    private string[] Genres = { string.Empty };
    private string URL = string.Empty;

    /// <summary>
    /// Creates the form and runs <see cref="Test"/> the first time it is shown.
    /// </summary>
    public Form1()
    {
        // Test() takes no arguments, so it is adapted to the EventHandler
        // signature through a lambda instead of being subscribed directly.
        Shown += (sender, e) => Test();
    }

    /// <summary>
    /// Parses the sample markup and displays one message box per release entry.
    /// </summary>
    private void Test()
    {
        // Load the html document.
        htmldoc.LoadHtml(html);

        // Select every release entry.
        // All relative XPath queries below use this DIV element as a starting point.
        htmlnodes = htmldoc.DocumentNode.SelectNodes("//div[@class='item']");

        // HtmlAgilityPack yields null (not an empty collection) when no node matches.
        if (htmlnodes == null)
        {
            return;
        }

        // Loop through the nodes.
        foreach (HtmlAgilityPack.HtmlNode node in htmlnodes)
        {
            // Set the values:
            Title = ReadAttribute(node, ".//div[@class='release']/h4/a[@title]", "title", "Unknown Title");
            Cover = ReadAttribute(node, ".//div[@class='thumb']/a/img[@src]", "src", string.Empty);
            Year = ReadInnerText(node, ".//div[@class='release-year']/span", string.Empty);
            URL = ReadAttribute(node, ".//div[@class='release']/h4/a[@href]", "href", "Unknown URL");

            HtmlAgilityPack.HtmlNodeCollection genreNodes = node.SelectNodes(".//div[@class='genre']/a");
            Genres = genreNodes == null
                ? new string[0]
                : (from genre in genreNodes select genre.InnerText).ToArray();

            // Display the values:
            sb.Clear();
            sb.AppendLine(string.Format("Title : {0}", Title));
            sb.AppendLine(string.Format("Cover : {0}", Cover));
            sb.AppendLine(string.Format("Year  : {0}", Year));
            sb.AppendLine(string.Format("Genres: {0}", string.Join(", ", Genres)));
            sb.AppendLine(string.Format("URL   : {0}", URL));
            Interaction.MsgBox(sb.ToString());
        }
    }

    /// <summary>
    /// Reads an attribute from the first node matching a relative XPath.
    /// </summary>
    /// <param name="parent">The node the XPath is evaluated against.</param>
    /// <param name="xpath">The relative XPath expression of the target node.</param>
    /// <param name="attributeName">The name of the attribute to read.</param>
    /// <param name="fallback">The value returned when the node or the attribute is missing.</param>
    /// <returns>The attribute value, or <paramref name="fallback"/>.</returns>
    private static string ReadAttribute(HtmlAgilityPack.HtmlNode parent, string xpath, string attributeName, string fallback)
    {
        HtmlAgilityPack.HtmlNode target = parent.SelectSingleNode(xpath);
        if (target == null)
        {
            return fallback;
        }

        return target.GetAttributeValue(attributeName, fallback);
    }

    /// <summary>
    /// Reads the inner text of the first node matching a relative XPath.
    /// </summary>
    /// <param name="parent">The node the XPath is evaluated against.</param>
    /// <param name="xpath">The relative XPath expression of the target node.</param>
    /// <param name="fallback">The value returned when no node matches.</param>
    /// <returns>The node's inner text, or <paramref name="fallback"/>.</returns>
    private static string ReadInnerText(HtmlAgilityPack.HtmlNode parent, string xpath, string fallback)
    {
        HtmlAgilityPack.HtmlNode target = parent.SelectSingleNode(xpath);
        if (target == null)
        {
            return fallback;
        }

        return target.InnerText;
    }
}
//=======================================================
//Service provided by Telerik (www.telerik.com)
//=======================================================
Además, te dejo este útil snippet para buscar y obtener todos los XPath de un archivo HTML, lo que facilita la tarea:
vb.net:
' Get Html XPaths
' By Elektro
'
' Example Usage:
'
' Dim Document As New HtmlAgilityPack.HtmlDocument
' Document.LoadHtml(IO.File.ReadAllText("C:\File.html"))
' Dim XpathList As List(Of String) = GetHtmlXPaths(Document)
' ListBox1.Items.AddRange((From XPath As String In XpathList Select XPath).ToArray)
''' <summary>
''' Collects the XPath expression of every element found in an
''' <see cref="HtmlAgilityPack.HtmlDocument"/>.
''' </summary>
''' <param name="Document">The parsed document whose element tree is walked.</param>
''' <returns>The list of collected XPath expressions, without duplicates.</returns>
Public Function GetHtmlXPaths(ByVal Document As HtmlAgilityPack.HtmlDocument) As List(Of String)

    Dim collected As New List(Of String)

    For Each topLevel As HtmlAgilityPack.HtmlNode In Document.DocumentNode.ChildNodes

        ' Only element nodes are walked; every other node type is skipped.
        If topLevel.NodeType <> HtmlAgilityPack.HtmlNodeType.Element Then
            Continue For
        End If

        ' Each top-level element starts from an empty path prefix.
        GetHtmlXPaths(topLevel, collected, String.Empty)

    Next topLevel

    Return collected

End Function
''' <summary>
''' Adds the XPath expression of an <see cref="HtmlAgilityPack.HtmlNode"/> and,
''' recursively, of all its element descendants to a list.
''' </summary>
''' <param name="Node">The node to process.</param>
''' <param name="XPathList">The list that receives each XPath not already present in it.</param>
''' <param name="XPath">The XPath built so far for the parent of <paramref name="Node"/>.</param>
Private Sub GetHtmlXPaths(ByVal Node As HtmlAgilityPack.HtmlNode,
                          ByRef XPathList As List(Of String),
                          Optional ByVal XPath As String = Nothing)

    Const ClassNameFilter As String = "[@class='{0}']"

    ' Extend the parent's path with the last segment of this node's own XPath.
    Dim nodePath As String = Node.XPath
    Dim currentPath As String = XPath & nodePath.Substring(nodePath.LastIndexOf("/"c))

    ' Qualify the segment with the node's class attribute when it has one.
    Dim cssClass As String = Node.GetAttributeValue("class", String.Empty)
    If Not String.IsNullOrEmpty(cssClass) Then
        currentPath &= String.Format(ClassNameFilter, cssClass)
    End If

    If Not XPathList.Contains(currentPath) Then
        XPathList.Add(currentPath)
    End If

    For Each descendant As HtmlAgilityPack.HtmlNode In Node.ChildNodes

        ' Only element nodes are walked; every other node type is skipped.
        If descendant.NodeType <> HtmlAgilityPack.HtmlNodeType.Element Then
            Continue For
        End If

        GetHtmlXPaths(descendant, XPathList, currentPath)

    Next descendant

End Sub
c#:
// Get Html XPaths
// By Elektro
/// <summary>
/// Collects the XPath expression of every element found in an
/// <see cref="HtmlAgilityPack.HtmlDocument"/>.
/// </summary>
/// <param name="Document">The parsed document whose element tree is walked.</param>
/// <returns>The list of collected XPath expressions, without duplicates.</returns>
public List<string> GetHtmlXPaths(HtmlAgilityPack.HtmlDocument Document)
{
    List<string> collected = new List<string>();

    foreach (HtmlAgilityPack.HtmlNode topLevel in Document.DocumentNode.ChildNodes)
    {
        // Only element nodes are walked; every other node type is skipped.
        if (topLevel.NodeType != HtmlAgilityPack.HtmlNodeType.Element)
        {
            continue;
        }

        // Each top-level element starts from an empty path prefix.
        GetHtmlXPaths(topLevel, ref collected, string.Empty);
    }

    return collected;
}
/// <summary>
/// Adds the XPath expression of an <see cref="HtmlAgilityPack.HtmlNode"/> and,
/// recursively, of all its element descendants to a list.
/// </summary>
/// <param name="Node">The node to process.</param>
/// <param name="XPathList">The list that receives each XPath not already present in it.</param>
/// <param name="XPath">The XPath built so far for the parent of <paramref name="Node"/>.</param>
private void GetHtmlXPaths(HtmlAgilityPack.HtmlNode Node, ref List<string> XPathList, string XPath = null)
{
    const string ClassNameFilter = "[@class='{0}']";

    // Extend the parent's path with the last segment of this node's own XPath.
    string nodePath = Node.XPath;
    string currentPath = XPath + nodePath.Substring(nodePath.LastIndexOf('/'));

    // Qualify the segment with the node's class attribute when it has one.
    string cssClass = Node.GetAttributeValue("class", string.Empty);
    if (!string.IsNullOrEmpty(cssClass))
    {
        currentPath += string.Format(ClassNameFilter, cssClass);
    }

    if (!XPathList.Contains(currentPath))
    {
        XPathList.Add(currentPath);
    }

    foreach (HtmlAgilityPack.HtmlNode descendant in Node.ChildNodes)
    {
        // Only element nodes are walked; every other node type is skipped.
        if (descendant.NodeType != HtmlAgilityPack.HtmlNodeType.Element)
        {
            continue;
        }

        GetHtmlXPaths(descendant, ref XPathList, currentPath);
    }
}
//=======================================================
//Service provided by Telerik (www.telerik.com)
//=======================================================
saludos