GeoPandas: Tìm điểm gần nhất trong khung dữ liệu khác

20

Tôi đã có 2 geodataframes:

import geopandas as gpd
from shapely.geometry import Point
# Frame whose rows each need a nearest neighbour looked up.
gpd1 = gpd.GeoDataFrame(
    [
        ['John', 1, Point(1, 1)],
        ['Smith', 1, Point(2, 2)],
        ['Soap', 1, Point(0, 2)],
    ],
    columns=['Name', 'ID', 'geometry'],
)
# Frame that is searched for the nearest point.
gpd2 = gpd.GeoDataFrame(
    [
        ['Work', Point(0, 1.1)],
        ['Shops', Point(2.5, 2)],
        ['Home', Point(1, 1.1)],
    ],
    columns=['Place', 'geometry'],
)

và tôi muốn tìm tên của điểm gần nhất trong gpd2 cho mỗi hàng trong gpd1:

desired_output = 

    Name  ID     geometry  Nearest
0   John   1  POINT (1 1)     Home
1  Smith   1  POINT (2 2)    Shops
2   Soap   1  POINT (0 2)     Work

Tôi đã cố gắng làm việc này bằng cách sử dụng một hàm lambda:

# Row by row: fetch the closest gpd2 row via min_dist and keep only its 'Place'.
gpd1['Nearest'] = gpd1.apply(
    lambda row: min_dist(row.geometry, gpd2)['Place'],
    axis=1,
)

với

def min_dist(point, gpd2):
    """Placeholder from the question: hand back whatever ``some_function`` yields.

    ``point`` and ``gpd2`` are not used yet; ``some_function`` stands in for
    the nearest-row lookup the question is asking for.
    """
    return some_function()

— Làm lại
nguồn

Phương pháp này hiệu quả với tôi: stackoverflow.com/questions/37402046/ Hãy nhìn vào liên kết

— Johnny Cheesecutter

16

Bạn có thể sử dụng trực tiếp hàm nearest_points của Shapely (các hình học trong GeoSeries chính là hình học Shapely):

from shapely.ops import nearest_points
# unary union of the gpd2 geometries
pts3 = gpd2.geometry.unary_union
def near(point, pts=pts3):
    """Return the ``Place`` of the gpd2 row closest to ``point``.

    ``pts`` is the geometry searched by ``nearest_points``; it defaults to the
    union of all gpd2 geometries built above.
    """
    # nearest_points returns a pair; element [1] is the point lying on ``pts``.
    nearest_geom = nearest_points(point, pts)[1]
    # Compare geometrically instead of with ``==`` so the match does not
    # depend on how element-wise equality is implemented for geometries.
    matches = gpd2.geometry.geom_equals(nearest_geom)
    # ``.iloc[0]`` replaces ``get_values()[0]``, which modern pandas no longer has.
    return gpd2.loc[matches, 'Place'].iloc[0]
gpd1['Nearest'] = gpd1.apply(lambda row: near(row.geometry), axis=1)
gpd1
    Name  ID     geometry  Nearest
0   John   1  POINT (1 1)     Home
1  Smith   1  POINT (2 2)    Shops
2   Soap   1  POINT (0 2)     Work

Giải thích

# Show, for every gpd1 row, the pair returned by nearest_points:
# the query point itself and the closest point found in pts3.
for i, row in gpd1.iterrows():
    origin, closest = nearest_points(row.geometry, pts3)
    # print() call form so the snippet also runs on Python 3.
    print(origin, closest)
 POINT (1 1) POINT (1 1.1)
 POINT (2 2) POINT (2.5 2)
 POINT (0 2) POINT (0 1.1)

— gen
nguồn

Có điều gì đó không hoạt động với tôi và tôi không thể tìm ra nguyên nhân. Hàm trả về một GeoSeries rỗng mặc dù dữ liệu hình học hoàn toàn hợp lệ. Ví dụ: với sample_point = gpd2.geometry.unary_union[400], biểu thức sample_point in gpd2.geometry trả về True, nhưng gpd2.geometry == sample_point lại trả về toàn False.

— cướp

Ngoài ra: gpd2.geometry.geom_equals(sample_point) thì hoạt động.

— cướp

13

Nếu bạn có các khung dữ liệu lớn, tôi thấy rằng phương thức .query của chỉ mục không gian cKDTree trong scipy trả về kết quả rất nhanh cho các tìm kiếm láng giềng gần nhất. Vì nó sử dụng một chỉ mục không gian, nên nó nhanh hơn nhiều bậc độ lớn so với việc lặp qua khung dữ liệu rồi tìm giá trị nhỏ nhất trong tất cả các khoảng cách. Nó cũng nhanh hơn so với việc dùng nearest_points của shapely kết hợp với RTree (phương pháp chỉ mục không gian có sẵn thông qua geopandas) vì cKDTree cho phép bạn vector hóa việc tìm kiếm, còn phương pháp kia thì không.

Đây là một hàm trợ giúp trả về khoảng cách và 'Tên' của láng giềng gần nhất trong gpd2 cho mỗi điểm trong gpd1. Nó giả định cả hai gdf đều có một cột geometry (chứa các điểm).

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point

# Query points: each row gets its nearest gpd2 neighbour attached.
gpd1 = gpd.GeoDataFrame(
    [
        ['John', 1, Point(1, 1)],
        ['Smith', 1, Point(2, 2)],
        ['Soap', 1, Point(0, 2)],
    ],
    columns=['Name', 'ID', 'geometry'],
)
# Candidate points that the KD-tree is built from.
gpd2 = gpd.GeoDataFrame(
    [
        ['Work', Point(0, 1.1)],
        ['Shops', Point(2.5, 2)],
        ['Home', Point(1, 1.1)],
    ],
    columns=['Place', 'geometry'],
)

def ckdnearest(gdA, gdB):
    """Attach to every point of ``gdA`` its nearest point of ``gdB``.

    Both frames are read through ``.geometry.x`` / ``.geometry.y``, so their
    geometries are expected to be points.

    Returns a frame with one row per ``gdA`` row (renumbered 0..n-1) holding:
    the ``gdA`` columns, every ``gdB`` column except ``'geometry'`` for the
    matched row, the matched row's original index label (the column added by
    ``reset_index()``), and ``dist``, the distance reported by the KD-tree.
    """
    # pd.concat(axis=1) aligns on index labels, while the query results are in
    # row order; renumber gdA so the three pieces line up row for row.
    gdA = gdA.reset_index(drop=True)
    nA = np.column_stack((gdA.geometry.x, gdA.geometry.y))
    nB = np.column_stack((gdB.geometry.x, gdB.geometry.y))
    btree = cKDTree(nB)
    dist, idx = btree.query(nA, k=1)
    # idx holds row positions within nB, so pick the gdB rows by position
    # (.iloc); label-based .loc is only correct for a default RangeIndex.
    matched = gdB.loc[:, gdB.columns != 'geometry'].iloc[idx].reset_index()
    gdf = pd.concat([gdA, matched, pd.Series(dist, name='dist')], axis=1)
    return gdf

# Example call: the combined frame is the expression's value and is not stored.
ckdnearest(gpd1, gpd2)

Và nếu bạn muốn tìm điểm gần nhất với LineString, đây là một ví dụ hoạt động đầy đủ:

import itertools
from operator import itemgetter

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString

# Query points.
gpd1 = gpd.GeoDataFrame(
    [
        ['John', 1, Point(1, 1)],
        ['Smith', 1, Point(2, 2)],
        ['Soap', 1, Point(0, 2)],
    ],
    columns=['Name', 'ID', 'geometry'],
)
# Candidate lines; each LineString contributes all of its vertices.
gpd2 = gpd.GeoDataFrame(
    [
        ['Work', LineString([Point(100, 0), Point(100, 1)])],
        ['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
        ['Home', LineString([Point(101, 0), Point(102, 1)])],
    ],
    columns=['Place', 'geometry'],
)


def ckdnearest(gdfA, gdfB, gdfB_cols=['Place']):
    """Attach to every ``gdfA`` geometry the ``gdfB`` row owning the nearest vertex.

    All coordinates of every ``gdfB`` geometry are loaded into one KD-tree, so
    ``dist`` is the distance to the closest *vertex* of the matched geometry,
    not to the closest location along its segments.

    Parameters
    ----------
    gdfA : frame of query geometries; each is expected to contribute exactly
        one coordinate (a point), otherwise the results no longer line up
        with the rows of ``gdfA``.
    gdfB : frame of candidate geometries (points or lines).
    gdfB_cols : columns of ``gdfB`` copied onto the result. The default list
        is only read, never modified.

    Returns
    -------
    Frame with the ``gdfA`` columns (rows renumbered 0..n-1), the requested
    ``gdfB`` columns of the matched row, and a ``dist`` column.
    """
    # pd.concat(axis=1) aligns on index labels; renumber so rows match the
    # order of the query results.
    gdfA = gdfA.reset_index(drop=True)
    A = np.concatenate(
        [np.array(geom.coords) for geom in gdfA.geometry.to_list()])
    B = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
    # B_ix[k] is the row position in gdfB of the geometry that vertex k
    # belongs to.
    B_ix = np.repeat(np.arange(len(B)), [len(coords) for coords in B])
    B = np.concatenate(B)
    ckd_tree = cKDTree(B)
    dist, idx = ckd_tree.query(A, k=1)
    # Array indexing keeps a 1-D result even for a single query point
    # (itemgetter(*idx) would collapse it to a scalar).
    rows = B_ix[idx]
    # rows are positions, so select with .iloc rather than label-based .loc.
    matched = gdfB.loc[:, gdfB_cols].iloc[rows].reset_index(drop=True)
    gdf = pd.concat([gdfA, matched, pd.Series(dist, name='dist')], axis=1)
    return gdf

# Example call: c holds gpd1's rows plus the matched 'Place' and 'dist' columns.
c = ckdnearest(gpd1, gpd2)

— JHuw
nguồn

Có thể đưa ra điểm gần nhất trên đường dây bằng cách sử dụng phương pháp này không? Ví dụ: để chụp vị trí GPS đến đường gần nhất.

— hyperjack

Câu trả lời này thật tuyệt vời! Tuy nhiên, đoạn mã tìm điểm gần nhất với các đường (LineString) lại gây ra lỗi với tôi. Có vẻ như khoảng cách đến đường gần nhất được trả về đúng cho mỗi điểm, nhưng id của đường được trả về lại sai. Tôi nghĩ vấn đề nằm ở phép tính idx, nhưng tôi khá mới với Python nên chưa thể hiểu rõ được nó.

— Shakesk

1

Tìm ra:

def min_dist(point, gpd2):
    """Return the row of ``gpd2`` whose geometry is closest to ``point``.

    The returned row carries an extra ``Dist`` entry holding that distance.
    ``gpd2`` itself is left untouched: the distances are attached to a copy,
    so repeated calls no longer add or overwrite a column on the caller's
    frame.

    Raises
    ------
    ValueError
        If ``gpd2`` has no rows.
    """
    if gpd2.empty:
        raise ValueError("gpd2 has no rows to search")
    distances = [point.distance(geom) for geom in gpd2.geometry]
    ranked = gpd2.assign(Dist=distances)
    # .values.argmin() is a row position, which is what .iloc expects.
    geoseries = ranked.iloc[ranked['Dist'].values.argmin()]
    return geoseries

Tất nhiên một số lời chỉ trích được chào đón. Tôi không phải là người thích tính toán lại gpd2 ['Dist'] cho mỗi hàng của gpd1 ...

— Làm lại
nguồn

1

Câu trả lời của Gene đã không làm việc cho tôi. Cuối cùng tôi phát hiện ra rằng gpd2.geometry.unary_union dẫn đến một hình học chỉ chứa khoảng 30.000 trong tổng số khoảng 150.000 điểm của tôi. Đối với bất kỳ ai khác gặp phải vấn đề tương tự, đây là cách tôi giải quyết nó:

    from shapely.ops import nearest_points
    from shapely.geometry import MultiPoint

    # Build the MultiPoint straight from the geometry list.
    gpd2_pts_list = gpd2.geometry.tolist()
    gpd2_pts = MultiPoint(gpd2_pts_list)
    def nearest(point, gpd2_pts, gpd2=gpd2, geom_col='geometry', src_col='Place'):
        """Return ``src_col`` of the ``gpd2`` row nearest to ``point``.

        ``gpd2_pts`` is the geometry searched by ``nearest_points``;
        ``geom_col`` names the geometry column used to find the matching row.
        """
        # nearest_points returns a pair; element [1] lies on gpd2_pts.
        nearest_point = nearest_points(point, gpd2_pts)[1]
        # Geometric comparison instead of ``==``; ``.iloc[0]`` replaces
        # ``get_values()[0]``, which modern pandas no longer provides.
        matches = gpd2[geom_col].geom_equals(nearest_point)
        value = gpd2.loc[matches, src_col].iloc[0]
        return value

    gpd1['Nearest'] = gpd1.apply(lambda x: nearest(x.geometry, gpd2_pts), axis=1)

— Inske
nguồn

0

Đối với bất kỳ ai có lỗi lập chỉ mục với dữ liệu của riêng họ trong khi sử dụng câu trả lời xuất sắc từ @ JHuw , vấn đề của tôi là các chỉ mục của tôi không phù hợp. Đặt lại chỉ mục của gdfA và gdfB đã giải quyết các vấn đề của tôi, có lẽ điều này cũng có thể giúp bạn @ Shakesk .

import itertools
from operator import itemgetter

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString

# Query points.
gpd1 = gpd.GeoDataFrame(
    [
        ['John', 1, Point(1, 1)],
        ['Smith', 1, Point(2, 2)],
        ['Soap', 1, Point(0, 2)],
    ],
    columns=['Name', 'ID', 'geometry'],
)
# Candidate lines searched for the nearest vertex.
gpd2 = gpd.GeoDataFrame(
    [
        ['Work', LineString([Point(100, 0), Point(100, 1)])],
        ['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
        ['Home', LineString([Point(101, 0), Point(102, 1)])],
    ],
    columns=['Place', 'geometry'],
)


def ckdnearest(gdfA, gdfB, gdfB_cols=['Place']):
    """Attach to every ``gdfA`` geometry the ``gdfB`` row owning the nearest vertex.

    All coordinates of every ``gdfB`` geometry are loaded into one KD-tree, so
    ``dist`` is the distance to the closest *vertex* of the matched geometry,
    not to the closest location along its segments.

    Parameters
    ----------
    gdfA : frame of query geometries; each is expected to contribute exactly
        one coordinate (a point), otherwise the results no longer line up
        with the rows of ``gdfA``.
    gdfB : frame of candidate geometries (points or lines).
    gdfB_cols : columns of ``gdfB`` copied onto the result. The default list
        is only read, never modified.

    Returns
    -------
    Frame with the ``gdfA`` columns (rows renumbered 0..n-1), the requested
    ``gdfB`` columns of the matched row, and a ``dist`` column.
    """
    # Renumber both frames: the KD-tree results are row positions, and
    # pd.concat(axis=1) aligns on index labels.
    gdfA = gdfA.reset_index(drop=True)
    gdfB = gdfB.reset_index(drop=True)
    A = np.concatenate(
        [np.array(geom.coords) for geom in gdfA.geometry.to_list()])
    B = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
    # B_ix[k] is the row of gdfB that vertex k belongs to.
    B_ix = np.repeat(np.arange(len(B)), [len(coords) for coords in B])
    B = np.concatenate(B)
    ckd_tree = cKDTree(B)
    dist, idx = ckd_tree.query(A, k=1)
    # Array indexing keeps a 1-D result even for a single query point
    # (itemgetter(*idx) would collapse it to a scalar).
    rows = B_ix[idx]
    matched = gdfB.loc[rows, gdfB_cols].reset_index(drop=True)
    gdf = pd.concat([gdfA, matched, pd.Series(dist, name='dist')], axis=1)
    return gdf

# Example call: c holds gpd1's rows plus the matched 'Place' and 'dist' columns.
c = ckdnearest(gpd1, gpd2)

— Markus Rosenfelder
nguồn