Foro de elhacker.net

Programación => .NET (C#, VB.NET, ASP) => Mensaje iniciado por: j0lama en 1 Febrero 2015, 23:06 pm



Título: Obtener texto de un elemento de una web
Publicado por: j0lama en 1 Febrero 2015, 23:06 pm
Hola, buenas. Mi duda es cómo puedo hacer para que, a partir de una web (por ejemplo, una que muestre el IBEX 35), se muestre en un label el valor que ha caído o subido el IBEX;
es decir, poner en un label el texto que da la web (el valor de la caída).
Por favor, explíquenmelo en C#, que VB no se me da muy bien xD
Gracias


Título: Re: Obtener texto de un elemento de una web
Publicado por: Eleкtro en 2 Febrero 2015, 02:12 am
Buenas

Pasos a seguir:

1) Obtener el código fuente para parsearlo.

2) Encontrar el elemento que contiene el valor, mediante técnica XPATH o REGEX, y obtener el valor en cuestión, del Ibex en este caso.

Solo debes examinar minuciosamente el código fuente de la página para localizar el XPath correcto. Te muestro un ejemplo real en VB.Net utilizando la librería HtmlAgilityPack, y también su traducción a C# (es una traducción automática, así que puede contener errores):

vb.net
Código
  ''' <summary>
  ''' Sample form that parses an embedded HTML document with HtmlAgilityPack and,
  ''' for each album entry (a DIV whose class is "item"), shows its title, cover,
  ''' year, genres and URL in a message box.
  ''' </summary>
  Public Class Form1

      ''' <summary>
      ''' The HTML to parse. An XML literal wrapping a CDATA section is used so the
      ''' markup can be pasted verbatim, without escaping the quotes.
      ''' </summary>
      Private ReadOnly html As String =
          <a><![CDATA[
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<body>

<div class="infolinks"><input type="hidden" name="IL_IN_TAG" value="1"/></div><div id="main">

<div class="music">

<h2 class="boxtitle">New releases \ <small>
<a href="/newalbums" title="New releases mp3 downloads" rel="bookmark">see all</a></small>
</h2>

<div class="item">

    <div class="thumb">
<a href="http://www.mp3crank.com/curt-smith/deceptively-heavy-121861" rel="bookmark" lang="en" title="Curt Smith - Deceptively Heavy album downloads"><img width="100" height="100" alt="Mp3 downloads Curt Smith - Deceptively Heavy" title="Free mp3 downloads Curt Smith - Deceptively Heavy" src="http://www.mp3crank.com/cover-album/Curt-Smith-Deceptively-Heavy-400x400.jpg"/></a>
    </div>

<div class="release">
<h3>Curt Smith</h3>
<h4>
<a href="http://www.mp3crank.com/curt-smith/deceptively-heavy-121861" title="Mp3 downloads Curt Smith - Deceptively Heavy">Deceptively Heavy</a>
</h4>
<script src="/ads/button.js"></script>
</div>

<div class="release-year">
<p>Year</p>
<span>2013</span>
</div>

<div class="genre">
<p>Genre</p>
<a href="http://www.mp3crank.com/genre/indie" rel="tag">Indie</a><a href="http://www.mp3crank.com/genre/pop" rel="tag">Pop</a>
</div>

</div>

<div class="item">

    <div class="thumb">
<a href="http://www.mp3crank.com/wolf-eyes/lower-demos-121866" rel="bookmark" lang="en" title="Wolf Eyes - Lower Demos album downloads"><img width="100" height="100" alt="Mp3 downloads Wolf Eyes - Lower Demos" title="Free mp3 downloads Wolf Eyes - Lower Demos" src="http://www.mp3crank.com/cover-album/Wolf-Eyes-–-Lower-Demos.jpg" /></a>
    </div>

<div class="release">
<h3>Wolf Eyes</h3>
<h4>
<a href="http://www.mp3crank.com/wolf-eyes/lower-demos-121866" title="Mp3 downloads Wolf Eyes - Lower Demos">Lower Demos</a>
</h4>
<script src="/ads/button.js"></script>
</div>

<div class="release-year">
<p>Year</p>
<span>2013</span>
</div>

<div class="genre">
<p>Genre</p>
<a href="http://www.mp3crank.com/genre/rock" rel="tag">Rock</a>
</div>

</div>

</div>

</div>

</body>
</html>
]]></a>.Value

      ' Reused buffer for the text shown in each message box.
      Private sb As New System.Text.StringBuilder

      Private htmldoc As HtmlAgilityPack.HtmlDocument = New HtmlAgilityPack.HtmlDocument
      Private htmlnodes As HtmlAgilityPack.HtmlNodeCollection = Nothing

      ' Values extracted from the item currently being processed.
      Private Title As String = String.Empty
      Private Cover As String = String.Empty
      Private Year As String = String.Empty
      Private Genres As String() = {String.Empty}
      Private URL As String = String.Empty

      ''' <summary>
      ''' Runs when the form is first shown: loads the sample HTML and displays
      ''' one message box per item found.
      ''' </summary>
      Private Sub Test() Handles MyBase.Shown

          ' Load the html document.
          htmldoc.LoadHtml(html)

          ' Select every item node.
          ' All XPath queries below are relative to this DIV element (note the leading ".").
          htmlnodes = htmldoc.DocumentNode.SelectNodes("//div[@class='item']")

          ' SelectNodes yields Nothing (not an empty collection) when nothing matches,
          ' which would make the For Each below throw.
          If htmlnodes Is Nothing Then
              Return
          End If

          ' Loop through the nodes.
          For Each node As HtmlAgilityPack.HtmlNode In htmlnodes

              ' Set the values:
              Title = ReadAttribute(node, ".//div[@class='release']/h4/a[@title]", "title", "Unknown Title")
              Cover = ReadAttribute(node, ".//div[@class='thumb']/a/img[@src]", "src", String.Empty)

              Dim yearNode As HtmlAgilityPack.HtmlNode = node.SelectSingleNode(".//div[@class='release-year']/span")
              Year = If(yearNode Is Nothing, String.Empty, yearNode.InnerText)

              Dim genreNodes As HtmlAgilityPack.HtmlNodeCollection = node.SelectNodes(".//div[@class='genre']/a")
              If genreNodes Is Nothing Then
                  Genres = New String() {}
              Else
                  Genres = (From genre In genreNodes Select genre.InnerText).ToArray
              End If

              URL = ReadAttribute(node, ".//div[@class='release']/h4/a[@href]", "href", "Unknown URL")

              ' Display the values:
              sb.Clear()
              sb.AppendLine(String.Format("Title : {0}", Title))
              sb.AppendLine(String.Format("Cover : {0}", Cover))
              sb.AppendLine(String.Format("Year  : {0}", Year))
              sb.AppendLine(String.Format("Genres: {0}", String.Join(", ", Genres)))
              sb.AppendLine(String.Format("URL   : {0}", URL))
              MsgBox(sb.ToString)

          Next node

      End Sub

      ''' <summary>
      ''' Reads an attribute from the first node matching <paramref name="xpath"/>
      ''' below <paramref name="parent"/>.
      ''' </summary>
      ''' <param name="parent">The node the XPath expression is evaluated against.</param>
      ''' <param name="xpath">The XPath expression locating the element.</param>
      ''' <param name="attributeName">The name of the attribute to read.</param>
      ''' <param name="fallback">The value returned when the element or the attribute is missing.</param>
      ''' <returns>The attribute value, or <paramref name="fallback"/>.</returns>
      Private Shared Function ReadAttribute(ByVal parent As HtmlAgilityPack.HtmlNode,
                                            ByVal xpath As String,
                                            ByVal attributeName As String,
                                            ByVal fallback As String) As String

          Dim target As HtmlAgilityPack.HtmlNode = parent.SelectSingleNode(xpath)

          ' Without this check a missing element raises a NullReferenceException
          ' and the fallback value is never used.
          If target Is Nothing Then
              Return fallback
          End If

          Return target.GetAttributeValue(attributeName, fallback)

      End Function

  End Class

c#:
Código
  using System;
  using System.Collections;
  using System.Collections.Generic;
  using System.Data;
  using System.Diagnostics;
  using System.Linq;
  using System.Windows.Forms;
  using Microsoft.VisualBasic;
  /// <summary>
  /// Sample form that parses an embedded HTML document with HtmlAgilityPack and,
  /// for each album entry (a DIV whose class is "item"), shows its title, cover,
  /// year, genres and URL in a message box.
  /// </summary>
  public class Form1 : Form
  {
      /// <summary>
      /// The HTML to parse, one source line per piece. A plain constant holds the
      /// text; the XElement/XCData wrapper of the automatic translation is dropped.
      /// </summary>
      private const string Html =
          "\n" +
          "<!DOCTYPE html>\n" +
          "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n" +
          "<body>\n" +
          "\n" +
          "\t<div class=\"infolinks\"><input type=\"hidden\" name=\"IL_IN_TAG\" value=\"1\"/></div><div id=\"main\">\n" +
          "\n" +
          "\t\t<div class=\"music\">\n" +
          "\n" +
          "\t\t\t<h2 class=\"boxtitle\">New releases \\ <small>\n" +
          "\t\t\t\t<a href=\"/newalbums\" title=\"New releases mp3 downloads\" rel=\"bookmark\">see all</a></small>\n" +
          "\t\t\t</h2>\n" +
          "\n" +
          "\t\t\t<div class=\"item\">\n" +
          "\n" +
          "\t    \t\t<div class=\"thumb\">\n" +
          "\t\t\t\t\t<a href=\"http://www.mp3crank.com/curt-smith/deceptively-heavy-121861\" rel=\"bookmark\" lang=\"en\" title=\"Curt Smith - Deceptively Heavy album downloads\"><img width=\"100\" height=\"100\" alt=\"Mp3 downloads Curt Smith - Deceptively Heavy\" title=\"Free mp3 downloads Curt Smith - Deceptively Heavy\" src=\"http://www.mp3crank.com/cover-album/Curt-Smith-Deceptively-Heavy-400x400.jpg\"/></a>\n" +
          "\t    \t\t</div>\n" +
          "\n" +
          "\t\t\t\t<div class=\"release\">\n" +
          "\t\t\t\t\t<h3>Curt Smith</h3>\n" +
          "\t\t\t\t\t<h4>\n" +
          "\t\t\t\t\t\t<a href=\"http://www.mp3crank.com/curt-smith/deceptively-heavy-121861\" title=\"Mp3 downloads Curt Smith - Deceptively Heavy\">Deceptively Heavy</a>\n" +
          "\t\t\t\t\t</h4>\n" +
          "\t\t\t\t\t<script src=\"/ads/button.js\"></script>\n" +
          "\t\t\t\t</div>\n" +
          "\n" +
          "\t\t\t\t<div class=\"release-year\">\n" +
          "\t\t\t\t\t<p>Year</p>\n" +
          "\t\t\t\t\t<span>2013</span>\n" +
          "\t\t\t\t</div>\n" +
          "\n" +
          "\t\t\t\t<div class=\"genre\">\n" +
          "\t\t\t\t\t<p>Genre</p>\n" +
          "\t\t\t\t\t<a href=\"http://www.mp3crank.com/genre/indie\" rel=\"tag\">Indie</a><a href=\"http://www.mp3crank.com/genre/pop\" rel=\"tag\">Pop</a>\n" +
          "\t\t\t\t</div>\n" +
          "\n" +
          "\t\t\t</div>\n" +
          "\n" +
          "\t\t\t<div class=\"item\">\n" +
          "\n" +
          "\t    \t\t<div class=\"thumb\">\n" +
          "\t\t\t\t\t<a href=\"http://www.mp3crank.com/wolf-eyes/lower-demos-121866\" rel=\"bookmark\" lang=\"en\" title=\"Wolf Eyes - Lower Demos album downloads\"><img width=\"100\" height=\"100\" alt=\"Mp3 downloads Wolf Eyes - Lower Demos\" title=\"Free mp3 downloads Wolf Eyes - Lower Demos\" src=\"http://www.mp3crank.com/cover-album/Wolf-Eyes-–-Lower-Demos.jpg\" /></a>\n" +
          "\t    \t\t</div>\n" +
          "\n" +
          "\t\t\t\t<div class=\"release\">\n" +
          "\t\t\t\t\t<h3>Wolf Eyes</h3>\n" +
          "\t\t\t\t\t<h4>\n" +
          "\t\t\t\t\t\t<a href=\"http://www.mp3crank.com/wolf-eyes/lower-demos-121866\" title=\"Mp3 downloads Wolf Eyes - Lower Demos\">Lower Demos</a>\n" +
          "\t\t\t\t\t</h4>\n" +
          "\t\t\t\t\t<script src=\"/ads/button.js\"></script>\n" +
          "\t\t\t\t</div>\n" +
          "\n" +
          "\t\t\t\t<div class=\"release-year\">\n" +
          "\t\t\t\t\t<p>Year</p>\n" +
          "\t\t\t\t\t<span>2013</span>\n" +
          "\t\t\t\t</div>\n" +
          "\n" +
          "\t\t\t\t<div class=\"genre\">\n" +
          "\t\t\t\t\t<p>Genre</p>\n" +
          "\t\t\t\t\t<a href=\"http://www.mp3crank.com/genre/rock\" rel=\"tag\">Rock</a>\n" +
          "\t\t\t\t</div>\n" +
          "\n" +
          "\t\t\t</div>\n" +
          "\n" +
          "\t\t</div>\n" +
          "\n" +
          "\t</div>\n" +
          "\n" +
          "</body>\n" +
          "</html>\n";

      // Reused buffer for the text shown in each message box.
      private readonly System.Text.StringBuilder sb = new System.Text.StringBuilder();
      private readonly HtmlAgilityPack.HtmlDocument htmldoc = new HtmlAgilityPack.HtmlDocument();

      /// <summary>
      /// Creates the form and runs the demo the first time the form is shown.
      /// </summary>
      public Form1()
      {
          Shown += Test;
      }

      /// <summary>
      /// Handles <see cref="Form.Shown"/>: loads the sample HTML and displays one
      /// message box per item found.
      /// </summary>
      /// <param name="sender">The event source (unused).</param>
      /// <param name="e">The event data (unused).</param>
      private void Test(object sender, EventArgs e)
      {
          // Load the html document.
          htmldoc.LoadHtml(Html);

          // Select every item node.
          // All XPath queries below are relative to this DIV element (note the leading ".").
          HtmlAgilityPack.HtmlNodeCollection items = htmldoc.DocumentNode.SelectNodes("//div[@class='item']");

          // SelectNodes yields null (not an empty collection) when nothing matches,
          // which would make the foreach below throw.
          if (items == null)
          {
              return;
          }

          foreach (HtmlAgilityPack.HtmlNode node in items)
          {
              // Set the values:
              string title = ReadAttribute(node, ".//div[@class='release']/h4/a[@title]", "title", "Unknown Title");
              string cover = ReadAttribute(node, ".//div[@class='thumb']/a/img[@src]", "src", string.Empty);

              HtmlAgilityPack.HtmlNode yearNode = node.SelectSingleNode(".//div[@class='release-year']/span");
              string year = yearNode == null ? string.Empty : yearNode.InnerText;

              HtmlAgilityPack.HtmlNodeCollection genreNodes = node.SelectNodes(".//div[@class='genre']/a");
              string[] genres = genreNodes == null
                  ? new string[0]
                  : (from genre in genreNodes select genre.InnerText).ToArray();

              string url = ReadAttribute(node, ".//div[@class='release']/h4/a[@href]", "href", "Unknown URL");

              // Display the values:
              sb.Clear();
              sb.AppendLine(string.Format("Title : {0}", title));
              sb.AppendLine(string.Format("Cover : {0}", cover));
              sb.AppendLine(string.Format("Year  : {0}", year));
              sb.AppendLine(string.Format("Genres: {0}", string.Join(", ", genres)));
              sb.AppendLine(string.Format("URL   : {0}", url));
              Interaction.MsgBox(sb.ToString());
          }
      }

      /// <summary>
      /// Reads an attribute from the first node matching <paramref name="xpath"/>
      /// below <paramref name="parent"/>.
      /// </summary>
      /// <param name="parent">The node the XPath expression is evaluated against.</param>
      /// <param name="xpath">The XPath expression locating the element.</param>
      /// <param name="attributeName">The name of the attribute to read.</param>
      /// <param name="fallback">The value returned when the element or the attribute is missing.</param>
      /// <returns>The attribute value, or <paramref name="fallback"/>.</returns>
      private static string ReadAttribute(HtmlAgilityPack.HtmlNode parent, string xpath, string attributeName, string fallback)
      {
          HtmlAgilityPack.HtmlNode target = parent.SelectSingleNode(xpath);

          // Without this check a missing element raises a NullReferenceException
          // and the fallback value is never used.
          return target == null ? fallback : target.GetAttributeValue(attributeName, fallback);
      }
  }
  61.  
  62. //=======================================================
  63. //Service provided by Telerik (www.telerik.com)
  64. //=======================================================

Además, aquí tienes este útil snippet para buscar y obtener todos los XPath de un archivo HTML, para que te resulte más fácil llevarlo a cabo:

vb.net:
Código
  1.    ' Get Html XPaths
  2.    ' By Elektro
  3.    '
  4.    ' Example Usage:
  5.    '
  6.    ' Dim Document As New HtmlAgilityPack.HtmlDocument
  7.    ' Document.LoadHtml(IO.File.ReadAllText("C:\File.html"))
  8.    ' Dim XpathList As List(Of String) = GetHtmlXPaths(Document)
  9.    ' ListBox1.Items.AddRange((From XPath As String In XpathList Select XPath).ToArray)
  10.  
  ''' <summary>
  ''' Collects the XPath expressions of all the elements of an
  ''' <see cref="HtmlAgilityPack.HtmlDocument"/>, without duplicates, in the order
  ''' in which the elements are first visited.
  ''' </summary>
  ''' <param name="Document">The <see cref="HtmlAgilityPack.HtmlDocument"/> to walk.</param>
  ''' <returns>The list of XPath expressions found.</returns>
  ''' <exception cref="System.ArgumentNullException"><paramref name="Document"/> is Nothing.</exception>
  Public Function GetHtmlXPaths(ByVal Document As HtmlAgilityPack.HtmlDocument) As List(Of String)

      ' Fail with a clear message instead of a NullReferenceException further down.
      If Document Is Nothing Then
          Throw New System.ArgumentNullException("Document")
      End If

      Dim XPathList As New List(Of String)

      For Each Child As HtmlAgilityPack.HtmlNode In Document.DocumentNode.ChildNodes

          ' Only elements are walked; any other node type is skipped.
          If Child.NodeType = HtmlAgilityPack.HtmlNodeType.Element Then
              ' Top-level elements start from an empty path prefix.
              GetHtmlXPaths(Child, XPathList, String.Empty)
          End If

      Next Child

      Return XPathList

  End Function
  32.  
  ''' <summary>
  ''' Builds the XPath expression of an <see cref="HtmlAgilityPack.HtmlNode"/>, adds it
  ''' to the list when it is not there yet, and then does the same for each child element.
  ''' </summary>
  ''' <param name="Node">The <see cref="HtmlAgilityPack.HtmlNode"/> to process.</param>
  ''' <param name="XPathList">The list that receives the XPath expressions.</param>
  ''' <param name="XPath">The XPath expression built so far for the parent of <paramref name="Node"/>.</param>
  Private Sub GetHtmlXPaths(ByVal Node As HtmlAgilityPack.HtmlNode,
                            ByRef XPathList As List(Of String),
                            Optional ByVal XPath As String = Nothing)

      ' Take only the last step of the node's own XPath (from the final "/" onwards)
      ' and append it to the expression received from the parent.
      Dim ownPath As String = Node.XPath
      Dim expression As String = XPath & ownPath.Substring(ownPath.LastIndexOf("/"c))

      ' When the element has a class attribute, add it as a filter.
      Dim cssClass As String = Node.GetAttributeValue("class", String.Empty)
      If Not String.IsNullOrEmpty(cssClass) Then
          expression &= String.Format("[@class='{0}']", cssClass)
      End If

      ' Keep each expression only once.
      If XPathList.IndexOf(expression) < 0 Then
          XPathList.Add(expression)
      End If

      For Each Child As HtmlAgilityPack.HtmlNode In Node.ChildNodes

          If Child.NodeType <> HtmlAgilityPack.HtmlNodeType.Element Then
              Continue For
          End If

          GetHtmlXPaths(Child, XPathList, expression)

      Next Child

  End Sub

c#:
Código
  1. // Get Html XPaths
  2. // By Elektro
  3.  
  /// <summary>
  /// Collects the XPath expressions of all the elements of an
  /// <see cref="HtmlAgilityPack.HtmlDocument"/>, without duplicates, in the order
  /// in which the elements are first visited.
  /// </summary>
  /// <param name="Document">The <see cref="HtmlAgilityPack.HtmlDocument"/> to walk.</param>
  /// <returns>The list of XPath expressions found.</returns>
  /// <exception cref="System.ArgumentNullException"><paramref name="Document"/> is <c>null</c>.</exception>
  public List<string> GetHtmlXPaths(HtmlAgilityPack.HtmlDocument Document)
  {
      // Fail with a clear message instead of a NullReferenceException further down.
      if (Document == null)
      {
          throw new System.ArgumentNullException("Document");
      }

      List<string> XPathList = new List<string>();

      foreach (HtmlAgilityPack.HtmlNode Child in Document.DocumentNode.ChildNodes)
      {
          // Only elements are walked; any other node type is skipped.
          if (Child.NodeType == HtmlAgilityPack.HtmlNodeType.Element)
          {
              // Top-level elements start from an empty path prefix.
              GetHtmlXPaths(Child, ref XPathList, string.Empty);
          }
      }

      return XPathList;
  }
  26.  
  /// <summary>
  /// Builds the XPath expression of an <see cref="HtmlAgilityPack.HtmlNode"/>, adds it
  /// to the list when it is not there yet, and then does the same for each child element.
  /// </summary>
  /// <param name="Node">The <see cref="HtmlAgilityPack.HtmlNode"/> to process.</param>
  /// <param name="XPathList">The list that receives the XPath expressions.</param>
  /// <param name="XPath">The XPath expression built so far for the parent of <paramref name="Node"/>.</param>
  private void GetHtmlXPaths(HtmlAgilityPack.HtmlNode Node, ref List<string> XPathList, string XPath = null)
  {
      // Take only the last step of the node's own XPath (from the final '/' onwards)
      // and append it to the expression received from the parent.
      string ownPath = Node.XPath;
      string expression = XPath + ownPath.Substring(ownPath.LastIndexOf('/'));

      // When the element has a class attribute, add it as a filter.
      string cssClass = Node.GetAttributeValue("class", string.Empty);
      if (!string.IsNullOrEmpty(cssClass))
      {
          expression += string.Format("[@class='{0}']", cssClass);
      }

      // Keep each expression only once.
      if (XPathList.IndexOf(expression) < 0)
      {
          XPathList.Add(expression);
      }

      foreach (HtmlAgilityPack.HtmlNode Child in Node.ChildNodes)
      {
          if (Child.NodeType != HtmlAgilityPack.HtmlNodeType.Element)
          {
              continue;
          }

          GetHtmlXPaths(Child, ref XPathList, expression);
      }
  }
  58.  
  59. //=======================================================
  60. //Service provided by Telerik (www.telerik.com)
  61. //=======================================================
  62.  

saludos