Berechnet das Maß der Ähnlichkeit zweier Strings.
Mehr Infos zu Levenshtein (nicht: Levenstein) bietet ... Wikipedia
Funktion:
Spoiler anzeigen
Nachtrach:
Hab mal ein bißchen rumgebastelt und das ganze in ne Klasse gepackt und ne Testfunktion geschrieben. Außerdem hab ich mal Multithreading implementiert.
Klasse:
Spoiler anzeigen
Testform:
Spoiler anzeigen
Die singlethreaded Methode braucht bei mir (dual-core) im schnitt 1050 ms, die Multithread Methode im Schnitt 700 ms.
Wenn jemand das ganze evtl mal auf nem i920 mit aktiviertem HT ausprobieren könnte ...
(PS: Anzahl der threads dann natürlich auf 4 oder 8 setzen!)
Wortliste runterladen nicht vergessen!
Mehr Infos zu Levenshtein (nicht: Levenstein) bietet ... Wikipedia
Funktion:
VB.NET-Quellcode
- ''' <summary>
- ''' Berechnet die Levenshtein Distanz zweier Strings.
- ''' Dies ist quasi ein Maß der Ähnlichkeit
- ''' </summary>
- ''' <param name="s">String 1</param>
- ''' <param name="t">String 2</param>
- ''' <returns>Levenshtein Distanz</returns>
- ''' <remarks>C# Sample nach VB portiert
- ''' Original: http://www.merriampark.com/ldcsharp.htm
- ''' </remarks>
- Public Function Levenshtein(ByVal s As String, ByVal t As String) As Integer
- Dim n As Integer = s.Length
- Dim m As Integer = t.Length
- Dim d(n + 1, m + 1) As Integer ' matrix
- Dim cost As Integer
- ' Step 1
- If n = 0 Then Return m
- If m = 0 Then Return n
- ' Step 2
- For i As Integer = 0 To n
- d(i, 0) = i
- Next
- For j As Integer = 0 To m
- d(0, j) = j
- Next
- ' Step 3
- For i = 1 To n
- For j = 1 To m
- ' Kosten für eine Position:
- If t(j - 1) = s(i - 1) Then
- cost = 0
- Else
- cost = 1
- End If
- d(i, j) = System.Math.Min(System.Math.Min(d(i - 1, j) + 1, d(i, j - 1) + 1), d(i - 1, j - 1) + cost)
- Next j
- Next i
- Return d(n, m)
- End Function
Nachtrach:
Hab mal ein bißchen rumgebastelt und das ganze in ne Klasse gepackt und ne Testfunktion geschrieben. Außerdem hab ich mal Multithreading implementiert.
Klasse:
VB.NET-Quellcode
- Imports System.IO
- Imports System.Threading
- Public Class Levenshtein
- Private Structure ThreadArgs
- Dim SearchString As String
- Dim SrcArray() As String
- Dim startIndex As Integer
- Dim endInex As Integer
- Dim DstList As List(Of String)
- Dim MaxDistanz As Integer
- Dim ARE As AutoResetEvent
- End Structure
- Private Shared WordList() As String
- Public Shared Sub DoubleList()
- Dim tmp(WordList.Length * 2 - 1) As String
- For i = 0 To tmp.Length - 1
- tmp(i) = WordList(i Mod WordList.Length)
- Next
- WordList = tmp
- End Sub
- Public Shared Function GetWordList(ByVal fname As String) As Integer
- Try
- WordList = IO.File.ReadAllLines(fname)
- Return WordList.Count
- Catch ex As Exception
- Return -1
- End Try
- End Function
- Public Shared Function GetThreadedLevenshteinList(ByVal s As String, ByVal MaxDistanz As Integer, ByVal NumThreads As Integer) As List(Of String)
- Dim l(NumThreads - 1) As List(Of String)
- Dim are(NumThreads - 1) As AutoResetEvent
- Dim tos(NumThreads - 1) As ThreadArgs
- Dim startindex As Integer = 0, numwords As Integer = WordList.Count \ NumThreads
- For i As Integer = 0 To NumThreads - 1
- l(i) = New List(Of String)
- are(i) = New AutoResetEvent(True)
- With tos(i)
- .SearchString = s
- .SrcArray = WordList
- .startIndex = startindex
- .endInex = Math.Min(.startIndex + numwords - 1, WordList.Length - 1)
- startindex += numwords
- .MaxDistanz = MaxDistanz
- .DstList = l(i)
- .ARE = are(i)
- .ARE.Reset()
- End With
- Dim t As New Thread(AddressOf GetLevenshteinListThread)
- t.Start(tos(i))
- Next
- System.Threading.WaitHandle.WaitAll(are)
- GetThreadedLevenshteinList = New List(Of String)
- For i As Integer = 0 To NumThreads - 1
- For Each st As String In tos(i).DstList
- GetThreadedLevenshteinList.Add(st)
- Next
- Next
- Return GetThreadedLevenshteinList
- End Function
- Private Shared Sub GetLevenshteinListThread(ByVal o As Object)
- Dim tos As ThreadArgs = DirectCast(o, ThreadArgs)
- Dim crntl As Integer
- Dim wort As String = String.Empty
- For i As Integer = tos.startIndex To tos.endInex
- crntl = Distanz(tos.SearchString, tos.SrcArray(i))
- If crntl < tos.MaxDistanz Then
- tos.DstList.Add(tos.SrcArray(i))
- End If
- Next
- tos.ARE.Set()
- End Sub
- Public Shared Function GetLevenshteinList(ByVal s As String, ByVal MaxDistanz As Integer) As List(Of String)
- Dim crntl As Integer
- Dim wort As String = String.Empty
- Dim l As New List(Of String)
- For Each t As String In WordList
- crntl = Distanz(s, t)
- If crntl < MaxDistanz Then
- l.Add(t)
- End If
- Next
- Return l
- End Function
- ''' <summary>
- ''' Berechnet die Levenshtein Distanz zweier Strings.
- ''' Dies ist quasi ein Maß der Ähnlichkeit
- ''' </summary>
- ''' <param name="s">String 1</param>
- ''' <param name="t">String 2</param>
- ''' <returns>Levenshtein Distanz</returns>
- ''' <remarks>C# Sample nach VB portiert
- ''' Original: http://www.merriampark.com/ldcsharp.htm
- ''' </remarks>
- Public Shared Function Distanz(ByVal s As String, ByVal t As String) As Integer
- Dim n As Integer = s.Length
- Dim m As Integer = t.Length
- Dim d(n + 1, m + 1) As Integer ' matrix
- Dim cost As Integer
- ' Step 1
- If n = 0 Then Return m
- If m = 0 Then Return n
- ' Step 2
- For i As Integer = 0 To n
- d(i, 0) = i
- Next
- For j As Integer = 0 To m
- d(0, j) = j
- Next
- ' Step 3
- For i = 1 To n
- For j = 1 To m
- ' Kosten für eine Position:
- If t(j - 1) = s(i - 1) Then
- cost = 0
- Else
- cost = 1
- End If
- d(i, j) = System.Math.Min(System.Math.Min(d(i - 1, j) + 1, d(i, j - 1) + 1), d(i - 1, j - 1) + cost)
- Next j
- Next i
- Return d(n, m)
- End Function
- End Class
Testform:
VB.NET-Quellcode
- Option Strict On
- Imports System.ComponentModel
- Imports System.IO
- Public Class Form1
- Private Sub Form1_Load(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MyBase.Load
- ' wortliste von: wortschatz.uni-leipzig.de/Papers/top10000de.txt
- Levenshtein.GetWordList(Path.Combine(My.Computer.FileSystem.SpecialDirectories.MyDocuments, "top10000de.txt"))
- ' liste auf 320.000 aufpusten
- For i As Integer = 1 To 5
- Levenshtein.DoubleList()
- Next
- End Sub
- Private Delegate Sub dlgSetListbox(ByVal s As String)
- Private Sub SetListbox(ByVal s As String)
- ListBox1.Items.Add(s)
- End Sub
- Private Sub RunInOtherThread()
- Dim stp As New Stopwatch
- Dim l As List(Of String)
- stp.Start()
- l = Levenshtein.GetThreadedLevenshteinList("Haus", 2, 2)
- Dim ms As Long = stp.ElapsedMilliseconds
- Dim d As dlgSetListbox = AddressOf SetListbox
- ListBox1.Invoke(d, "Threaded: " & ms.ToString)
- End Sub
- Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click
- ' Leider können wir die threaded funktion nicht aus dem Hauptthread aufrufen,
- ' weil der STAThread ist und dann geht WaitHandle.WaitAll nicht
- ' Also schmeissen wir mal nen dummy thread an ;)
- Dim t As New Threading.Thread(AddressOf RunInOtherThread)
- t.Start()
- End Sub
- Private Sub Button2_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button2.Click
- Dim stp As New Stopwatch
- Dim l As List(Of String)
- stp.Start()
- l = Levenshtein.GetLevenshteinList("Haus", 2)
- Dim ms As Long = stp.ElapsedMilliseconds
- ListBox1.Items.Add("Single: " & ms.ToString)
- End Sub
- End Class
Die singlethreaded Methode braucht bei mir (dual-core) im schnitt 1050 ms, die Multithread Methode im Schnitt 700 ms.
Wenn jemand das ganze evtl mal auf nem i920 mit aktiviertem HT ausprobieren könnte ...
(PS: Anzahl der threads dann natürlich auf 4 oder 8 setzen!)
Wortliste runterladen nicht vergessen!
Dieser Beitrag wurde bereits 3 mal editiert, zuletzt von „picoflop“ ()